In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from os.path import join
import numpy as np

In [2]:
# Locations of the trained BPE artefacts:
#   <vocab_dir>/<prefix>-vocab.json and <vocab_dir>/<prefix>-merges.txt
vocab_dir = "snp_data"
prefix = "chr_diploid"

vocab_filename = join(vocab_dir, "{}-vocab.json".format(prefix))
merges_filename = join(vocab_dir, "{}-merges.txt".format(prefix))

# read_file hands back only the vocabulary and the merge rules; any other model
# option has to be supplied again when the model is rebuilt.
vocab, merges = BPE.read_file(vocab_filename, merges_filename)

# The vocabulary contains "</w>"-terminated entries, so the model must be
# rebuilt with the same end-of-word suffix; without it those entries can never
# be produced during encoding.
# NOTE(review): if a pre-tokenizer/normalizer was used when the vocabulary was
# trained, it also needs to be re-attached to `tokenizer` — nothing visible
# here shows which one was used; confirm against the training script.
bpe = BPE(vocab, merges, end_of_word_suffix="</w>")
tokenizer = Tokenizer(bpe)

In [3]:
# Upper bound on sequence length used for the tokenizer and for truncation.
max_len = 4096

# Single-character control tokens: padding, mask, unknown, separator, classifier.
PAD, MASK, UNK, SEP, CLS = "$", ".", "?", "|", "*"

In [4]:
# Register the control tokens as *special* tokens (not ordinary added tokens)
# so the backend never splits them and can drop them when decoding with
# skip_special_tokens=True. The call returns the number of tokens newly added.
tokenizer.add_special_tokens([PAD, MASK, UNK, SEP, CLS])

5

In [6]:
from transformers import PreTrainedTokenizerFast

# Wrap the raw `tokenizers` object in the transformers fast-tokenizer interface,
# declaring which symbols play the pad/mask/unk/sep/cls roles.
fast_tokenizer_kwargs = {
    "tokenizer_object": tokenizer,
    "model_max_length": max_len,
    "pad_token": PAD,
    "mask_token": MASK,
    "unk_token": UNK,
    "sep_token": SEP,
    "cls_token": CLS,
    "padding_side": "right",
    "add_special_tokens": True,
}
fast_tokenizer = PreTrainedTokenizerFast(**fast_tokenizer_kwargs)

In [7]:
# Write the wrapped tokenizer's files into the "fast_tokenizer" directory;
# the call returns the paths of the files it wrote.
fast_tokenizer.save_pretrained("fast_tokenizer")

('fast_tokenizer\\tokenizer_config.json',
 'fast_tokenizer\\special_tokens_map.json',
 'fast_tokenizer\\tokenizer.json')

In [8]:
# Display the token -> id mapping of the wrapped tokenizer.
# NOTE(review): this renders the entire vocabulary as cell output; consider
# showing only its size or a small sample instead.
fast_tokenizer.vocab

{'ACAAG': 466,
 'AAATGG': 547,
 'TCAGAGG': 3073,
 'TTTTTTTTTTTTTTTTTTTT</w>': 20845,
 'AGGTTCAAGCG': 11657,
 'TCGGGGG': 9196,
 'AGATATGCC': 20171,
 'TATTTCAAC': 21427,
 'TATTCTCTC': 13115,
 'TCCCGAGTTC': 14303,
 'CCCAGGCTGGAGTGCAGTGGC': 19338,
 'ATCTGCC': 1764,
 'TCAGTATT': 5488,
 'TGTGTTTG': 7921,
 'TGTGGGCC': 6240,
 'CTTGCC': 19998,
 'AGCACAGG': 10657,
 'AAGTTATG': 13699,
 'AATATTG': 1498,
 'AGTGTTGG': 8275,
 'TTTGGGCC': 6180,
 'AGAGGAGAA': 18357,
 'AGGTAGCC': 21241,
 'TTCTGCCC': 7491,
 'TTTGGATG': 13728,
 'ACATCTAA': 19264,
 'AGGGCATCCC': 10849,
 'AGCATAG': 11758,
 'ACAAAAGCC': 15367,
 'ATTATAAAA': 5957,
 'TGACG': 1832,
 'AGTTTT</w>': 12151,
 'AAAGGAGGC': 16491,
 'ACAGAGCAAGAC': 17075,
 'ACAGGGGC': 7069,
 'TGTTATCC': 9294,
 'AATTAATTG': 15213,
 'ATCAGTGG': 10479,
 'TAACACTT': 20888,
 'AATTTAAG': 4865,
 'TTTTTAAAA': 9074,
 'ACTTAAAA': 3323,
 'AACTGGG': 5826,
 'ATTCAATTC': 18680,
 'ATGGTAC': 3624,
 'ATGTGTAT': 11730,
 'TCCATT': 1844,
 'AAGTGATCCACCTGCC': 14059,
 'AGGATGCC': 13286,
 'A

In [9]:
# Integer id assigned to the padding token.
fast_tokenizer.pad_token_id

22000

In [10]:
# Check that the unknown-token symbol is present in the vocabulary.
'?' in fast_tokenizer.vocab

True

In [11]:
# Integer id assigned to the unknown token.
fast_tokenizer.unk_token_id

22002

In [8]:
seq_dir = "sample_30000.txt"
# Gather every non-blank line of the file with surrounding whitespace removed.
with open(seq_dir, encoding='utf-8') as seq_fh:
    seq_list = [stripped for stripped in map(str.strip, seq_fh) if stripped != '']

In [9]:
# Encode the sequence at index 5, truncating the result to at most max_len ids.
seq0 = seq_list[5]
retval = fast_tokenizer.encode(
    seq0,
    max_length=max_len,
    truncation=True,
)

In [11]:
# String currently configured as the unknown token.
fast_tokenizer.unk_token

'?'

In [None]:
# Look up the id of the unknown-token string through the vocabulary.
fast_tokenizer.convert_tokens_to_ids(fast_tokenizer.unk_token)

In [9]:
# Length of the raw sequence string, in characters.
len(seq0)

2410

In [10]:
# Number of ids produced by encoding seq0.
len(retval)

37

In [15]:
# Show the encoded ids for seq0.
retval

[20601,
 1681,
 3408,
 3339,
 3259,
 8182,
 4887,
 7900,
 2217,
 1620,
 296,
 29,
 679,
 1294,
 15741,
 59,
 218,
 1136,
 3,
 23,
 3431,
 227,
 15574,
 993,
 782,
 11251,
 2214,
 1311,
 273,
 4504,
 15,
 57,
 210,
 3910,
 10210,
 14446,
 4004,
 2565,
 856,
 293,
 296,
 576,
 13905,
 2240,
 332,
 3093,
 216,
 5859,
 359,
 236,
 2497,
 695,
 1027,
 419,
 16580,
 1510,
 3301,
 262,
 3103,
 256,
 233,
 307,
 23,
 3059,
 869,
 6317,
 838,
 811,
 296,
 896,
 315,
 3240,
 2093,
 17258,
 536,
 393,
 65,
 5,
 528,
 702,
 377,
 226,
 735,
 92,
 228,
 1841,
 345,
 4796,
 795,
 219,
 17145,
 365,
 836,
 181,
 65,
 9,
 797,
 3758,
 2521,
 253,
 65,
 1552,
 13780,
 918,
 1717,
 416,
 309,
 1781,
 1116,
 5131,
 11271,
 9298,
 188,
 3,
 18,
 18,
 18,
 196,
 1481,
 196,
 3659,
 1044,
 5676,
 472,
 1554,
 441,
 349,
 8880,
 467,
 478,
 567,
 6019,
 18464,
 494,
 243,
 329,
 341,
 2863,
 490,
 282,
 2247,
 199,
 1060,
 209,
 23,
 5,
 1342,
 424,
 1030,
 664,
 672,
 4080,
 727,
 222,
 2230,
 218,
 441,
 2

In [4]:
from transformers import BertTokenizer

In [16]:
tokenizer_dir = "bert-SNP"
tokenizer.save(tokenizer_dir)

# do_lower_case is disabled: the vocabulary shown in this notebook consists of
# upper-case nucleotide strings, so lower-casing the input would make every
# lookup miss and fall back to the unknown token.
# NOTE(review): `tokenizer.save` writes a single `tokenizers` JSON file at this
# path, whereas BertTokenizer is WordPiece-based and reads a plain vocabulary
# file — confirm that this load yields the intended vocabulary, or load the
# saved file through the fast-tokenizer wrapper instead.
bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_dir,
                                                    do_lower_case=False,
                                                    do_basic_tokenize=True,
                                                    tokenize_chinese_chars=True,
                                                    pad_token=PAD,
                                                    mask_token=MASK,
                                                    unk_token=UNK,
                                                    sep_token=SEP,
                                                    cls_token=CLS,
                                                    padding_side="right"
                                                    )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
import json

In [10]:
# Parse the vocabulary JSON and keep a view of its keys (the token strings).
with open(vocab_filename, encoding='utf-8') as vocab_fh:
    vocab_dict = json.load(vocab_fh)

token_list = vocab_dict.keys()

In [11]:
# Show the token strings read from the vocabulary file.
token_list

dict_keys(['[UNK]', '[CLS]', '[SEP]', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'N', 'O', 'P', 'T', 'U', 'V', '一', '丁', '丂', '七', '丄', '丅', '丆', '万', '丈', '三', '上', '下', '丌', '不', '与', '丏', '丐', '丑', '丒', '专', '且', '丕', '世', '丗', '丘', '丙', '业', '丛', '东', '丝', '丞', '丟', '丠', '両', '丢', '丣', '两', '严', '並', '丧', '丨', '丩', '个', '丫', '丬', '中', '丮', '丯', '丰', '丱', '串', '丳', '临', '丵', '丶', '丷', '丸', '丹', '为', '主', '丼', '丽', '举', '丿', '乀', '乁', '乂', '乃', '乄', '久', '乆', '乇', '么', '义', '乊', '之', '乌', '乍', '乎', '乏', '乐', '乑', '乒', '乓', '乔', '乕', '乖', '乗', '乘', '乙', '乚', '乛', '乜', '九', '乞', '也', '习', '乡', '乢', '乣', '乤', '乥', '书', '乧', '乨', 'C</w>', 'A</w>', 'G</w>', 'T</w>', 'N</w>', '一</w>', '丄</w>', '丂</w>', '九</w>', '万</w>', '串</w>', '丨</w>', '两</w>', '丁</w>', '乎</w>', '乨</w>', '丕</w>', '丳</w>', '丼</w>', '丩</w>', '乌</w>', '丘</w>', '丹</w>', '久</w>', '乊</w>', '乗</w>', '七</w>', '丌</w>', '之</w>', '乍</w>', '丫</w>', '乧</w>', '丆</w>', '乆</w>', '丅</w>', '丰</w>', '丒</w>', '乑</w>', '乇</w>', '丈</w>', '业