# Configuration Globale

In [1]:
# imports
import math
import os
import random

import numpy  as np
import pandas as pd
import torch
import torch.nn   as nn

from tqdm         import tqdm
from datasets     import Dataset
from transformers import AutoTokenizer
from transformers import BertConfig, BertForPreTraining
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
# import evaluate (if you intend to use metrics during training -- but it OOMS on manneback)

In [2]:
# --- run mode --------------------------------------------------------------
# Is this the very first time that we want to start training this model?
# When True, the cells below build the corpus, the tokenizer, the NSP dataset
# and a freshly initialised model; when False they reload existing artefacts.
FIRST_ROUND = False

# --- hub repositories ------------------------------------------------------
# SOURCE is what gets passed to `from_pretrained`; TARGET is where results are
# pushed (and where the tokenizer is reloaded from on later rounds).
SOURCE = "xaviergillard/parti-pris-v2-f32"
TARGET = "xaviergillard/parti-pris-v2-f32"

# --- numeric precision -----------------------------------------------------
DTYPE = torch.float32

# --- training schedule -----------------------------------------------------
BATCH_SZ = 8      # 64 on manneback
TRAIN_EPOCHS = 3  # 92 over the course of 20h on manneback. I performed 3 rounds

# --- model / tokenizer dimensions (fed to BertConfig and the tokenizer) ----
MAX_LENGTH = 1024     # context window, also used as max_position_embeddings
VOCAB_SIZE = 30522
HIDDEN_SIZE = 384
NUM_HIDDEN_L = 6      # number of hidden layers
NUM_ATTN_HEAD = 6     # number of attention heads
INTERMEDIATE = 3072   # intermediate (feed-forward) size

# Chargement du dataset

In [3]:
if FIRST_ROUND:
    def _title_then_body(row):
        """Build the 'all_texts' field: the title, a newline, then the full text."""
        return {'all_texts': row['title'] + '\n' + row['full_text']}

    # Read the raw CSV, drop the two index-like columns, and add the
    # concatenated text column that the tokenizer cells work on.
    ds = (
        Dataset.from_csv("corpus_partipris_v2.csv")
        .remove_columns(["Unnamed: 0", "index"])
        .map(_title_then_body)
    )

# Création du Tokenizer

In [4]:
# Train the tokenizer (first round only).
# NOTE(review): this needs SOURCE to already host a tokenizer to start from;
# confirm that is the case on a genuinely first run.
if FIRST_ROUND:
    base_tokenizer = AutoTokenizer.from_pretrained(SOURCE)
    corpus_texts   = iter(ds['all_texts'])
    # Re-learn a vocabulary of VOCAB_SIZE entries from the corpus texts.
    tokenizer = base_tokenizer.train_new_from_iterator(corpus_texts, vocab_size=VOCAB_SIZE)
    # Publish the retrained tokenizer to the TARGET repository.
    tokenizer.push_to_hub(TARGET)

# Création du dataset avec les NSP

In [5]:
# Step 1: tokenize every text with a window smaller than the model's, so that
#         two chunks can later be placed side by side and still fit inside
#         the context window of the trained model.
if FIRST_ROUND:
    # Half of the model window once 3 positions are set aside (presumably for
    # the special tokens of a sentence pair -- TODO confirm). Texts longer
    # than that are split into several overlapping chunks (stride of 2).
    tokenize_half_size = dict(
        truncation=True,
        max_length=(MAX_LENGTH - 3) // 2,
        return_overflowing_tokens=True,
        stride=2,
    )

    def _encode_row(row):
        """Attach the tokenizer output for 'all_texts' under the 'encoded' key."""
        return {'encoded': tokenizer.encode(row['all_texts'], **tokenize_half_size)}

    ds = ds.map(_encode_row)

In [6]:
# Step 2: perform a pass over the encoded texts and create 1 datapoint for each chunk of data.
#         For each of these, we flip a coin (proba 1/2) to decide whether the 2nd half of the
#         context window holds the chunk that really follows, or the first chunk of another,
#         randomly chosen document.
#
#         `next_sentence_label` follows the BertForPreTraining convention:
#         0 = segment B is the continuation of segment A, 1 = segment B is a random segment.
if FIRST_ROUND:
    data = ds['encoded']   # one entry per document; each entry is a list of chunks
    N    = len(data)
    info = []
    for i, text in enumerate(tqdm(data, total=N)):
        for j, chunk in enumerate(text):
            # A true "next" chunk only exists when this is not the last chunk of the
            # document. (The comparison must be against the number of chunks in the
            # document, not against the number of tokens in the chunk.)
            has_next  = j < len(text) - 1
            nsp_label = has_next and bool(random.getrandbits(1))
            if nsp_label:
                # positive example: the chunk that follows in the same document
                other = text[j + 1]
            else:
                # negative example: first chunk of a *different* document whenever
                # there is more than one document to choose from
                if N > 1:
                    other_doc = random.randrange(N - 1)
                    if other_doc >= i:
                        other_doc += 1
                else:
                    other_doc = 0
                other = data[other_doc][0]
            # Back to plain text (without [CLS]/[SEP]) so that the pair can be
            # re-encoded together with the proper special tokens and segment ids.
            a = tokenizer.decode(chunk, skip_special_tokens=True)
            b = tokenizer.decode(other, skip_special_tokens=True)
            # truncation=True makes max_length effective: re-tokenizing decoded text
            # may yield a few more tokens than the original chunks, and a pair longer
            # than MAX_LENGTH would overflow the model's position embeddings.
            record = tokenizer(a, b, max_length=MAX_LENGTH, truncation=True, padding=False)
            record['next_sentence_label'] = 0 if nsp_label else 1
            info.append(record)

    # Persist the examples so that later rounds can skip the whole preparation.
    data = pd.DataFrame(info)
    data.to_parquet('partipris_pretraining_full.parquet')

# Entraînement du Modèle à proprement parler

In [8]:
# initialize the model
if FIRST_ROUND:
    # First round: `data` and `tokenizer` were produced by the cells above.
    # `data` is still a pandas DataFrame at this point, and a DataFrame has no
    # `train_test_split`, so it is wrapped into a `datasets.Dataset` first.
    if isinstance(data, pd.DataFrame):
        data = Dataset.from_pandas(data)
    config    = BertConfig(
        vocab_size              = VOCAB_SIZE,
        hidden_size             = HIDDEN_SIZE,
        num_hidden_layers       = NUM_HIDDEN_L,
        num_attention_heads     = NUM_ATTN_HEAD,
        intermediate_size       = INTERMEDIATE,
        max_position_embeddings = MAX_LENGTH)
    # randomly initialised weights (nothing pretrained yet)
    model     = BertForPreTraining(config)
else:
    # Later rounds: reload the prepared examples from disk, the tokenizer from
    # TARGET and the current weights from SOURCE.
    data      = Dataset.from_pandas(pd.read_parquet('partipris_pretraining_full.parquet'))
    tokenizer = AutoTokenizer.from_pretrained(TARGET)
    model     = BertForPreTraining.from_pretrained(SOURCE, torch_dtype=DTYPE)

# split test train
# The seed is fixed so that every training round uses the SAME held-out 5%.
# With an unseeded shuffle each round draws a new split, and examples used for
# evaluation in one round end up in the training set of the next.
SPLIT_SEED = 42
data = data.train_test_split(train_size=0.95, shuffle=True, seed=SPLIT_SEED)

#########################################################################################
# metrics
#########################################################################################
# metrics are disabled because unfortunately, it causes cuda OOM on manneback
#########################################################################################
#loss     = nn.CrossEntropyLoss()
#accuracy = evaluate.load("accuracy")
#def ignoring_dummy(preds, labels, dummy=-100):
#    yhat = []
#    y    = []
#    labels = labels.reshape((-1,))
#    preds  = preds.reshape((labels.shape[0], -1))    
#    for i,label in enumerate(labels):
#        if label == dummy:
#            continue
#        else:
#            y.append(label)
#            yhat.append(preds[i].argmax())
#    yhat = np.array(yhat)
#    y    = np.array(y)
#    return (yhat, y)
#    
#def compute_metrics(eval):
#    y_mlm, y_nsp = eval.label_ids
#    h_mlm, h_nsp = eval.predictions
#    #
#    y_mlm = torch.tensor(y_mlm.reshape((-1,))).to('cpu')
#    h_mlm = torch.tensor(h_mlm.reshape((y_mlm.shape[0], -1))).to('cpu')
#    l_mlm = loss(h_mlm, y_mlm)
#    
#    y_nsp = torch.tensor(y_nsp.reshape((-1,))).to('cpu')
#    h_nsp = torch.tensor(h_nsp.reshape((y_nsp.shape[0], -1))).to('cpu')
#    l_nsp = loss(h_nsp, y_nsp)
#    #
#    h_mlm, y_mlm = ignoring_dummy(h_mlm, y_mlm, dummy=-100)
#    a_mlm = accuracy.compute(predictions=h_mlm, references=y_mlm)
#    a_nsp = accuracy.compute(predictions=h_nsp.argmax(axis=-1), references=y_nsp)
#    #
#    return {
#        'mlm_accuracy': a_mlm['accuracy'], 
#        'nsp_accuracy': a_nsp['accuracy'], 
#        'mlm_loss': l_mlm, 
#        'nsp_loss': l_nsp, 
#        'tot_loss': l_mlm + l_nsp 
#    }
#
#########################################################################################

# training
# The Hub credential is read from the environment (HF_TOKEN) instead of being
# written in the notebook, so that no secret ends up in a saved/committed file.
# When the variable is not set this is None and no explicit token is passed.
HUB_TOKEN = os.environ.get("HF_TOKEN")

# Collator applied to every batch; built with its default settings for this tokenizer.
collator  = DataCollatorForLanguageModeling(tokenizer=tokenizer)
args      = TrainingArguments(
    num_train_epochs            = TRAIN_EPOCHS,
    per_device_train_batch_size = BATCH_SZ,
    #
    output_dir                  = './checkpoints',
    overwrite_output_dir        = True,
    save_strategy               = "epoch",
    save_total_limit            = 2,
    #
    eval_strategy               = "epoch",
    #eval_steps                  = 1,
    #eval_accumulation_steps     = 1,
    #torch_empty_cache_steps     = 1,
    #
    gradient_accumulation_steps = 2, # a batch of 64 was OK
    bf16                        = (DTYPE == torch.bfloat16),
    #
    push_to_hub                 = True,
    hub_model_id                = TARGET,
    hub_strategy                = "every_save",
    hub_token                   = HUB_TOKEN)

# NOTE(review): recent transformers releases rename the `tokenizer` argument of
# Trainer to `processing_class` -- confirm against the installed version.
trainer = Trainer(
    model           = model,
    tokenizer       = tokenizer,
    train_dataset   = data['train'],
    eval_dataset    = data['test'],
    args            = args,
    data_collator   = collator,
    #compute_metrics = compute_metrics
)

trainer.train()

# the end: publish the final weights with the same credential as the Trainer
model.push_to_hub(TARGET, token=HUB_TOKEN)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 