In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

In [None]:
# One pickled pandas DataFrame per split, loaded through the "pandas" builder.
# NOTE(review): these files are presumably unpickled on load - only use
# pickles that come from a trusted source.
data_files = {
    "train": "swerick_data_random_train.pkl",
    "test": "swerick_data_random_test.pkl",
    "valid": "swerick_data_random_valid.pkl",
}
swerick_dataset = load_dataset("pandas", data_files=data_files)
swerick_dataset

In [None]:
def get_training_corpus(dataset=None, batch_size=1000, text_column="texte"):
    """Yield the training texts in consecutive batches for tokenizer training.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping from column name to a list of values. When
            ``None`` (the default) ``swerick_dataset["train"]`` is used; it is
            looked up when iteration starts, not when the generator is created.
        batch_size: Number of rows per yielded batch. Must be positive.
        text_column: Name of the column that holds the raw text.

    Yields:
        The ``text_column`` values of each slice of ``batch_size`` rows, in
        order. The last batch may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised on first
            iteration, since this is a generator).
    """
    if batch_size <= 0:
        # A negative step would silently yield nothing; fail loudly instead.
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if dataset is None:
        dataset = swerick_dataset["train"]
    for start in range(0, len(dataset), batch_size):
        yield dataset[start:start + batch_size][text_column]

In [None]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

# Start from an untrained WordPiece model; "[UNK]" is its unknown-token string.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [None]:
# BERT-style normalization with lowercase=False and strip_accents=False,
# i.e. the text is neither lowercased nor accent-stripped.
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False,strip_accents=False)


In [None]:
# Use the `Whitespace` pre-tokenizer as the pre-tokenization step.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [None]:
# Sanity check: show how the pre-tokenizer splits the text of training row 100.
tokenizer.pre_tokenizer.pre_tokenize_str(swerick_dataset["train"][100]["texte"])

In [None]:
# Tokens handed to the trainer as special tokens.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# WordPiece trainer with a vocabulary size of 50325.
# NOTE(review): the rationale for this exact size is not visible here.
trainer = trainers.WordPieceTrainer(
    special_tokens=special_tokens,
    vocab_size=50325,
)

In [None]:
# Train the tokenizer on the text batches produced by get_training_corpus().
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [None]:
# Encode the text of training row 100 with the trained tokenizer and show its tokens.
encoding = tokenizer.encode(swerick_dataset["train"][100]["texte"])
print(encoding.tokens)

In [None]:
# Look up the vocabulary ids of the "[CLS]" and "[SEP]" tokens (in that order).
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(marker) for marker in ("[CLS]", "[SEP]")
)
print(cls_token_id, sep_token_id)

In [None]:
# Post-processing template: wrap a single sequence as "[CLS] A [SEP]" and a
# pair as "[CLS] A [SEP] B [SEP]". The ":0" / ":1" suffixes are the type ids
# given to each piece. The templates are plain strings: they contain no
# placeholders, so the previous f-string prefix was unnecessary.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [None]:
# Decode with the WordPiece decoder, using "##" as the continuation prefix.
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [None]:
# Decode the ids of the sample encoding back into text.
tokenizer.decode(encoding.ids)

In [None]:
# Serialize the tokenizer to "tokenizer_swerick.json" (relative path).
tokenizer.save("tokenizer_swerick.json")

In [None]:
from transformers import PreTrainedTokenizerFast

# Wrap the serialized tokenizer in a PreTrainedTokenizerFast and declare which
# token strings play the unk / pad / cls / sep / mask roles.
wrapped_tokenizer = PreTrainedTokenizerFast(
    # Alternative: pass the in-memory object instead of the file -> tokenizer_object=tokenizer
    tokenizer_file="tokenizer_swerick.json", # Load from the serialized tokenizer JSON file
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [None]:
from transformers import BertTokenizerFast

# BertTokenizerFast defaults to do_lower_case=True and strip_accents=None, and
# rebuilds the backend normalizer when those differ from the wrapped
# tokenizer's own settings. The tokenizer here was configured with
# BertNormalizer(lowercase=False, strip_accents=False), so the same values are
# passed explicitly; otherwise the wrapper would lowercase and accent-strip
# its input and no longer match the cased vocabulary it was trained with.
wrapped_tokenizer = BertTokenizerFast(
    tokenizer_object=tokenizer,
    do_lower_case=False,
    strip_accents=False,
)

In [None]:
# Tokenizer comparison: load the pretrained "KBLab/bert-base-swedish-cased"
# tokenizer to compare against.
base_tokenizer = AutoTokenizer.from_pretrained("KBLab/bert-base-swedish-cased")

In [None]:
def replace(x):
    """Return token ``x`` without its leading WordPiece continuation marker.

    Only a leading ``"##"`` that is followed by at least one more character is
    removed. Stripping every ``"##"`` occurrence (as ``str.replace`` does)
    would also eat hash characters that belong to the token itself and
    collapse tokens such as ``"##"`` or ``"####"`` into the empty string.
    """
    if len(x) > 2 and x.startswith("##"):
        return x[2:]
    return x


# Vocabulary entries of both tokenizers with the continuation marker removed,
# so word-initial and word-internal pieces can be compared as plain strings.
swerick_voc = [replace(token) for token in wrapped_tokenizer.vocab.keys()]
base_voc = [replace(token) for token in base_tokenizer.vocab.keys()]

In [None]:
def get_vocab_sim(first_vocab_keys, second_vocab_keys):
    """Measure the overlap between two vocabularies.

    Args:
        first_vocab_keys: Iterable of tokens of the first vocabulary.
        second_vocab_keys: Iterable of tokens of the second vocabulary.

    Returns:
        A 5-tuple ``(intersection, first_ratio, second_ratio, jaccard, only_first)``:

        * ``intersection`` - set of tokens present in both vocabularies.
        * ``first_ratio`` - ``len(intersection) / len(set(first_vocab_keys))``.
        * ``second_ratio`` - ``len(intersection) / len(set(second_vocab_keys))``.
        * ``jaccard`` - ``len(intersection) / len(union)``.
        * ``only_first`` - set of tokens found only in the first vocabulary.

        Any ratio whose denominator would be zero (an empty vocabulary) is
        reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    f_set = set(first_vocab_keys)
    s_set = set(second_vocab_keys)
    intersection = f_set & s_set
    union = f_set | s_set
    shared_count = len(intersection)

    def _ratio(total):
        # Guard against empty vocabularies: no tokens means no overlap.
        return shared_count / total if total else 0.0

    return (
        intersection,
        _ratio(len(f_set)),
        _ratio(len(s_set)),
        _ratio(len(union)),
        f_set - s_set,
    )

In [None]:
# Compare the newly trained vocabulary (first argument) with the baseline one.
inter,f,s,jaccard,vocab_f=get_vocab_sim(swerick_voc,base_voc)
# NOTE(review): this prints the whole intersection set, which may be very large.
print(inter)
# Share of the swerick vocabulary, then of the baseline vocabulary, that lies in the intersection.
print(f,s)
print("similarity of Jaccard",jaccard)
# Tokens present in the swerick vocabulary but absent from the baseline one.
print("New Vocab added in tokenizer of swerick", vocab_f)

In [None]:
same_words=0
wrong_words=0
wrong_list=[]
for word in list(swerick_voc):
    base_token = base_tokenizer.tokenize(word)