## Imports

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

### Tokenizer

In [3]:
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)

In [4]:
dna = "ACGTAGCATCGGATCTATCTATCGACACTTGGTTATCGATCTACGAGCATCTCGTTAGC"
inputs = tokenizer(dna, return_tensors = 'pt')["input_ids"]

In [5]:
inputs

tensor([[   1,    5,  194,   32,  757, 1239, 2092,  294,   24,  359,   88,   93,
           32,   75,   77,   19,    2]])

In [6]:
tokenizer.decode(inputs[0])

'[CLS] A CGTA GCA TCGGA TCTATCTA TCGACA CTTGG TTA TCGA TCTA CGA GCA TCTC GTTA GC [SEP]'

### DNABERT-2-117M

> need to load the model from specific commit which solves the issue https://huggingface.co/zhihan1996/DNABERT-2-117M/commit/6617c7e3829423fddd80ba03c7c7dc4f8aab4d19 I've been having otherwise -> revision can be ommited if the PR is accepted

In [7]:
model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, revision='6617c7e')

Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
hidden_states = model(inputs)[0]
hidden_states

tensor([[[-0.0458,  0.0782,  0.1223,  ...,  0.2533,  0.1660,  0.0863],
         [-0.0590, -0.0850,  0.1442,  ...,  0.2694,  0.0734, -0.0645],
         [-0.2030,  0.2774,  0.0958,  ..., -0.1426,  0.1620,  0.1039],
         ...,
         [-0.0018, -0.0709,  0.1182,  ...,  0.1514, -0.2617,  0.1708],
         [-0.0510,  0.0114,  0.1349,  ..., -0.1366, -0.0012,  0.2496],
         [ 0.0246,  0.2306,  0.1297,  ...,  0.1221,  0.1937, -0.0584]]],
       grad_fn=<ViewBackward0>)

In [9]:
hidden_states.shape

torch.Size([1, 17, 768])

### Load dataset

In [37]:
from datasets import load_dataset

In [38]:
dataset = load_dataset('simecek/Human_DNA_v0')

In [39]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Seq'],
        num_rows: 263659
    })
    test: Dataset({
        features: ['Seq'],
        num_rows: 29296
    })
})

### Arithmetic compression

#### StaticModel

StaticModel: A class which implements a static model that doesn't adapt to input data or statistics.

In [4]:
from arithmetic_compressor import AECompressor
from arithmetic_compressor.models import StaticModel

# create the model
model = StaticModel({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25})

# create the coder
coder = AECompressor(model)

# encode a sequence
data = "ACGTAGC"
N = len(data)
compressed = coder.compress(data)

# print the compressed data
print(compressed) 

[0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1]


In [5]:
decoded = coder.decompress(compressed, N)

print(decoded)

['A', 'C', 'G', 'T', 'A', 'G', 'C']


In [9]:
from arithmetic_compressor.models import\
   PPMModel,\
   MultiPPM

# create the model
model = PPMModel(['A', 'C', 'G', 'T'], k = 3)

# create an arithmetic coder
coder = AECompressor(model)

# encode some data
data = "ACGTAGCACGTATAAAGGCCATA"
N = len(data)
compressed = coder.compress(data)

# print the compressed data
print(compressed) 

[0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


In [10]:
decoded = coder.decompress(compressed, N)

print(decoded)

['A', 'C', 'G', 'T', 'A', 'G', 'C', 'A', 'C', 'G', 'T', 'A', 'T', 'A', 'A', 'A', 'G', 'G', 'C', 'C', 'A', 'T', 'A']
