In [10]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

In [11]:
data_files={"train":"swerick_data_random_train.pkl","test":"swerick_data_random_test.pkl","valid":"swerick_data_random_valid.pkl"}
swerick_dataset = load_dataset("pandas",data_files=data_files)
swerick_dataset

DatasetDict({
    train: Dataset({
        features: ['protocole', 'texte'],
        num_rows: 12296
    })
    test: Dataset({
        features: ['protocole', 'texte'],
        num_rows: 2689
    })
    valid: Dataset({
        features: ['protocole', 'texte'],
        num_rows: 2689
    })
})

In [9]:
def get_training_corpus():
    for i in range(0, len(swerick_dataset["train"]),1000):
        yield swerick_dataset["train"][i:i+1000]["texte"]

In [3]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [None]:
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False,strip_accents=False)


In [None]:
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [None]:
tokenizer.pre_tokenizer.pre_tokenize_str(swerick_dataset["train"][100]["texte"])

In [None]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=50325, special_tokens=special_tokens)

In [None]:
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [None]:
encoding = tokenizer.encode(swerick_dataset["train"][100]["texte"])
print(encoding.tokens)

In [None]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

In [None]:
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [None]:
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [None]:
tokenizer.decode(encoding.ids)

In [None]:
tokenizer.save("tokenizer_swerick.json")

In [None]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    #tokenizer_object=tokenizer,
    tokenizer_file="tokenizer_swerick.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [None]:
from transformers import BertTokenizerFast

wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

In [15]:
#difference de tokenizer 
base_tokenizer = AutoTokenizer.from_pretrained("KBLab/bert-base-swedish-cased")

In [18]:
replace =lambda x :x.replace('Ġ',"")
swerick_voc=list(wrapped_tokenizer.vocab.keys())
base_voc=list(base_tokenizer.vocab.keys())

In [16]:
def get_vocab_sim(first_vocab_keys,second_vocab_keys):
    f_set=set(first_vocab_keys)
    s_set=set(second_vocab_keys)
    intersection = f_set.intersection(s_set)
    union = f_set.union(s_set)

    similarity_jaccard= len(intersection)/len(union)
    vocab_f = f_set-s_set
    return intersection, len(intersection)/len(f_set), len(intersection)/len(s_set),similarity_jaccard,vocab_f

In [19]:
inter,f,s,jaccard,vocab_f=get_vocab_sim(swerick_voc,base_voc)
print(inter)
print(f,s)
print("similarity of Jaccard",jaccard)
print("New Vocab added in tokenizer of swerick", vocab_f)

{'42', 'dryck', 'stap', '46', 'uppfattningen', 'res', 'vaga', 'kampanj', 'ryss', 'yr', 'spri', 'AHL', 'bron', 'utskottets', 'fastighet', 'EU', 'redskap', 'ration', 'stycke', 'lys', 'fattiga', '29', 'oo', '73', 'forum', 'vard', 'bi', 'rig', 'staterna', 'verk', 'gjorde', 'metaller', 'prop', 'ytan', 'skikt', 'provin', 'ank', 'producenter', 'kla', 'tull', 'marginalen', 'deras', 'fem', 'fastigheter', 'bes', 'sm', 'institutionerna', 'valet', 'strejken', 'fas', 'rapporter', '11', 'bjud', 'skoter', 'A', '220', 'visade', 'viken', 'samt', 'dras', 'byggnaden', 'kille', 'penning', 'CR', 'hing', 'fost', 'stuga', 'fam', 'beskrivning', 'hanteringen', 'anordningar', 'TT', 'domstolen', 'skriften', 'urs', 'H', 'enheten', '225', 'BO', 'em', 'kommunikationer', 'regim', 'gn', 'fil', 'mil', 'gru', 'sken', 'visar', '93', 'maj', 'ali', 'avdelning', 'liggande', '56', 'gens', 'bol', 'Z', 'fasta', 'pat', 'kasse', 'nytta', 'pe', 'slag', 'qu', 'kontroller', 'parten', 'representanter', 'kommiss', 'brott', 'stabil',

In [None]:
same_words=0
wrong_words=0
wrong_list=[]
for word in list(swerick_voc):
    base_token = base_tokenizer.tokenize(word)

In [57]:
from datasets import load_dataset

toy_data =  load_dataset("Abirate/french_book_reviews")

Downloading readme: 100%|██████████| 5.47k/5.47k [00:00<00:00, 15.6MB/s]
Downloading data: 100%|██████████| 4.21M/4.21M [00:01<00:00, 2.87MB/s]
Generating train split: 9658 examples [00:00, 812821.56 examples/s]


In [59]:
toy_data['train'][0]

{'book_title': 'Le Démon de la Colline aux Loups',
 'author': 'Dimitri Rouchon-Borie',
 'reader_review': 'Ce n\'est pas le premier roman à aborder les thèmes lourds de l\'inceste et de l\'enfance martyre, mais il le fait avec une audace et un brio incomparables qui rendent ce livre marquant dans une vie de lecteur. On y sent à quel point l\'auteur n\'a pas cherché à "faire quelque chose", on ne sent jamais l\'intention, on sent juste l\'urgence, incandescente, à raconter la vérité d\'un homme maltraité par la vie au point de dire à la nuit «\xa0 tu ne me feras pas peur j\'ai plus de noir que toi dans mon enfance\xa0».',
 'rating': 5.0,
 'label': 1}

In [39]:
# new tokenizer BPE

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [65]:
print(tokenizer.pre_tokenizer.pre_tokenize_str(toy_data['train'][0]['reader_review']))
encoding = tokenizer.encode(toy_data['train'][0]['reader_review'])
print(tokenizer.decode(encoding.ids))

[('Ce', (0, 2)), ('Ġn', (2, 4)), ("'", (4, 5)), ('est', (5, 8)), ('Ġpas', (8, 12)), ('Ġle', (12, 15)), ('Ġpremier', (15, 23)), ('Ġroman', (23, 29)), ('ĠÃł', (29, 31)), ('Ġaborder', (31, 39)), ('Ġles', (39, 43)), ('ĠthÃ¨mes', (43, 50)), ('Ġlourds', (50, 57)), ('Ġde', (57, 60)), ('Ġl', (60, 62)), ("'", (62, 63)), ('inceste', (63, 70)), ('Ġet', (70, 73)), ('Ġde', (73, 76)), ('Ġl', (76, 78)), ("'", (78, 79)), ('enfance', (79, 86)), ('Ġmartyre', (86, 94)), (',', (94, 95)), ('Ġmais', (95, 100)), ('Ġil', (100, 103)), ('Ġle', (103, 106)), ('Ġfait', (106, 111)), ('Ġavec', (111, 116)), ('Ġune', (116, 120)), ('Ġaudace', (120, 127)), ('Ġet', (127, 130)), ('Ġun', (130, 133)), ('Ġbrio', (133, 138)), ('Ġincomparables', (138, 152)), ('Ġqui', (152, 156)), ('Ġrendent', (156, 164)), ('Ġce', (164, 167)), ('Ġlivre', (167, 173)), ('Ġmarquant', (173, 182)), ('Ġdans', (182, 187)), ('Ġune', (187, 191)), ('Ġvie', (191, 195)), ('Ġde', (195, 198)), ('Ġlecteur', (198, 206)), ('.', (206, 207)), ('ĠOn', (207, 210)),

In [41]:
trainer = trainers.BpeTrainer(vocab_size=52000, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)






In [42]:
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [53]:
!file swerick_data_random_train.pkl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


swerick_data_random_train.pkl: XENIX 8086 relocatable or 80286 small model


In [23]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [62]:
tokenizer.decoder = decoders.ByteLevel()

In [46]:
import os
if not os.path.exists("BPE_swerick_tokenizer"):
 os.makedirs("BPE_swerick_tokenizer")

tokenizer.model.save("BPE_swerick_tokenizer")

['BPE_swerick_tokenizer/vocab.json', 'BPE_swerick_tokenizer/merges.txt']

In [3]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
config = RobertaConfig(
 vocab_size=52_000,
 max_position_embeddings=514,
 num_attention_heads=12,
 num_hidden_layers=12,
 type_vocab_size=1,
)
model = RobertaForMaskedLM(config=config)

In [1]:
context_length=128

In [2]:
from transformers import RobertaTokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained('BPE_swerick_tokenizer', max_length=512)

  from .autonotebook import tqdm as notebook_tqdm


In [77]:
print(roberta_tokenizer.tokenize("Herr talman, jag ber att få önska att min motion bordläggs för vidare behandling"))

['Herr', 'Ġtalman', ',', 'Ġjag', 'Ġber', 'Ġatt', 'ĠfÃ¥', 'ĠÃ¶nska', 'Ġatt', 'Ġmin', 'Ġmotion', 'Ġbord', 'lÃ¤ggs', 'ĠfÃ¶r', 'Ġvidare', 'Ġbehandling']


In [66]:
def tokenize(element):
    outputs = roberta_tokenizer(
        element["texte"],
        truncation=False,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    eos_token_id = roberta_tokenizer.eos_token_id  # Utilisez sep_token_id ici
    print(eos_token_id)  # Cela va imprimer l'identifiant numérique du token [SEP]
    concatenated_sequence = []

    # Append each tokenized input with the eos_token_id and flatten into a single list
    for ids in outputs["input_ids"]:
        concatenated_sequence.extend(ids + [eos_token_id])

    # Remove the last eos_token_id if it's at the end of the sequence
    if concatenated_sequence[-1] == eos_token_id:
        concatenated_sequence.pop()

    # Chunk the concatenated sequence into segments of context_length
    input_batch = []
    for i in range(0, len(concatenated_sequence), context_length):
        chunk = concatenated_sequence[i:i + context_length]
        if len(chunk) == context_length:
            input_batch.append(chunk)

    # Return the chunked sequences
    return {"input_ids": input_batch}

# Utilisation de la fonction modifiée pour le mapping
tokenized_datasets = swerick_dataset.map(
    tokenize, batched=True, remove_columns=swerick_dataset["train"].column_names
)

model = RobertaForMaskedLM(config=config).cuda()

Map:   0%|          | 0/12296 [00:00<?, ? examples/s]

52001


Map:   8%|▊         | 1000/12296 [01:33<15:18, 12.30 examples/s]

52001


Map:  16%|█▋        | 2000/12296 [02:53<13:57, 12.29 examples/s]

52001


Map:  24%|██▍       | 3000/12296 [04:07<12:52, 12.03 examples/s]

Map:  24%|██▍       | 3000/12296 [04:18<12:52, 12.03 examples/s]

52001


Map:  33%|███▎      | 4000/12296 [05:43<11:36, 11.91 examples/s]

52001


Map:  41%|████      | 5000/12296 [07:59<12:56,  9.40 examples/s]

52001


Map:  49%|████▉     | 6000/12296 [10:22<12:28,  8.41 examples/s]

52001


Map:  57%|█████▋    | 7000/12296 [14:30<14:13,  6.21 examples/s]

52001


Map:  65%|██████▌   | 8000/12296 [17:31<11:59,  5.97 examples/s]

52001


Map:  73%|███████▎  | 9000/12296 [19:44<08:36,  6.38 examples/s]

52001


Map:  81%|████████▏ | 10000/12296 [22:26<06:03,  6.32 examples/s]

52001


Map:  89%|████████▉ | 11000/12296 [25:16<03:29,  6.17 examples/s]

52001


Map:  98%|█████████▊| 12000/12296 [27:57<00:47,  6.18 examples/s]

52001


Map: 100%|██████████| 12296/12296 [28:45<00:00,  7.12 examples/s]
Map:   0%|          | 0/2689 [00:00<?, ? examples/s]

52001


Map:  37%|███▋      | 1000/2689 [01:51<02:42, 10.36 examples/s]

52001


Map:  74%|███████▍  | 2000/2689 [04:42<01:42,  6.70 examples/s]

52001


Map: 100%|██████████| 2689/2689 [06:45<00:00,  6.63 examples/s]
Map:   0%|          | 0/2689 [00:00<?, ? examples/s]

52001


Map:  37%|███▋      | 1000/2689 [01:47<02:41, 10.45 examples/s]

52001


Map:  74%|███████▍  | 2000/2689 [04:41<01:42,  6.73 examples/s]

52001


Map: 100%|██████████| 2689/2689 [06:42<00:00,  6.68 examples/s]


In [73]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 3367306
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 756777
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 756777
    })
})

In [74]:
import pickle
with open("tokenized_dataset_roberta.pkl","wb") as f :
    pickle.dump(tokenized_datasets,f)

In [5]:
import pickle
with open("tokenized_dataset_roberta.pkl","rb") as f :
    tokenized_datasets =pickle.load(f)

In [23]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
 tokenizer=roberta_tokenizer, mlm=True, mlm_probability=0.30
)


In [None]:
tokenized_datasets['train'][0]

Dataset({
    features: ['input_ids'],
    num_rows: 3367306
})

In [22]:
vocab_size = model.config.vocab_size
print("Taille du vocabulaire:", vocab_size)

max_token_id = max([max(sequence) for sequence in tokenized_datasets['train'] if sequence])

print(max_token_id)
# Vérifier si tous les indices sont valides
assert max_token_id < vocab_size, "Des indices de tokens dépassent la taille du vocabulaire!"

Taille du vocabulaire: 52000


KeyboardInterrupt: 

In [7]:
from transformers import Trainer, TrainingArguments
batch_size=64
training_args = TrainingArguments(
 output_dir='Roberta_swerick',
 resume_from_checkpoint=True,
 report_to=[],
overwrite_output_dir=True,
save_strategy="epoch",
save_total_limit=100,
load_best_model_at_end=True,
evaluation_strategy="epoch",
learning_rate=1e-4,
weight_decay=0.01,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
bf16=True,
no_cuda=True,
logging_dir='./logs', 
num_train_epochs=3
)
model = model.to("cpu")
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    #prediction_loss_only=True,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


IndexError: index out of range in self