In [1]:
import random
import math
import pandas as pd
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

tokenizer = AutoTokenizer.from_pretrained("xaviergillard/parti-pris-v2")

data = pd.read_csv("corpus_partipris_v2.csv")\
            .apply(lambda row: f"{row['title']}\n{row['full_text']}", axis=1)\
            .map(lambda x: tokenizer.encode(x, truncation=True, stride=2, max_length=(1024-3)//2, return_overflowing_tokens=True))

N    = len(data)
info = []
for i,text in enumerate(data):
    for j,chunk in enumerate(text):
        nsp_label = j < len(chunk) - 1 and bool(random.getrandbits(1))
        other= ""
        if nsp_label: 
            other = chunk[j+1]
        else:
            next  = int(math.floor(random.random() * N))
            other = data[next][0]
        that = tokenizer.decode(other)
        a = tokenizer.decode(chunk, skip_special_tokens=True)
        b = tokenizer.decode(other, skip_special_tokens=True)
        record = tokenizer(a, b, max_length=1024, truncation=False, padding=False)
        record['next_sentence_label'] = 0 if nsp_label else 1
        info.append(record)

data = pd.DataFrame(info)
data.to_parquet('partipris_pretraining_full.parquet')

In [2]:
# train tokenizer
from transformers import AutoTokenizer

if False: 
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased').train_new_from_iterator(iter(data['all_texts']), vocab_size=30522)
    tokenizer.push_to_hub("xaviergillard/parti-pris-v2")
else: 
    tokenizer = AutoTokenizer.from_pretrained("xaviergillard/parti-pris-v2")



In [2]:
df = pd.read_parquet('partipris_pretraining_full.parquet')
df['input_ids'].map(lambda x: len(x)).describe()

count    32334.000000
mean       606.377961
std        285.884073
min          7.000000
25%        454.000000
50%        512.000000
75%        877.000000
max       1023.000000
Name: input_ids, dtype: float64

In [13]:
import numpy  as np
import pandas as pd
import torch
import torch.nn   as nn
from transformers import AutoTokenizer
from transformers import BertConfig, BertForPreTraining
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets     import Dataset
import evaluate

# model
data      = Dataset.from_pandas(pd.read_parquet('partipris_pretraining_full.parquet')[:100])
data      = data.train_test_split(train_size=0.8, shuffle=True)
tokenizer = AutoTokenizer.from_pretrained("xaviergillard/parti-pris-v2")
#config    = BertConfig(
#    hidden_size             =  384, # 768
#    num_hidden_layers       =    6, # 12
#    num_attention_heads     =    6, # 12
#    max_position_embeddings = 1024, # 512
#    is_decoder              = False)
model     = BertForPreTraining.from_pretrained("xaviergillard/parti-pris-v2", torch_dtype=torch.bfloat16) 

# metrics
loss     = nn.CrossEntropyLoss()
accuracy = evaluate.load("accuracy")
def ignoring_dummy(preds, labels, dummy=-100):
    yhat = []
    y    = []
    labels = labels.reshape((-1,))
    preds  = preds.reshape((labels.shape[0], -1))    
    for i,label in enumerate(labels):
        if label == dummy:
            continue
        else:
            y.append(label)
            yhat.append(preds[i].argmax())
    yhat = np.array(yhat)
    y    = np.array(y)
    return (yhat, y)
    
def compute_metrics(eval):
    y_mlm, y_nsp = eval.label_ids
    h_mlm, h_nsp = eval.predictions
    #
    y_mlm = torch.tensor(y_mlm.reshape((-1,)))
    h_mlm = torch.tensor(h_mlm.reshape((y_mlm.shape[0], -1)))
    l_mlm = loss(h_mlm, y_mlm)
    
    y_nsp = torch.tensor(y_nsp.reshape((-1,)))
    h_nsp = torch.tensor(h_nsp.reshape((y_nsp.shape[0], -1)))
    l_nsp = loss(h_nsp, y_nsp)
    #
    h_mlm, y_mlm = ignoring_dummy(h_mlm, y_mlm, dummy=-100)
    a_mlm = accuracy.compute(predictions=h_mlm, references=y_mlm)
    a_nsp = accuracy.compute(predictions=h_nsp.argmax(axis=-1), references=y_nsp)
    #
    return {
        'mlm_accuracy': a_mlm['accuracy'], 
        'nsp_accuracy': a_nsp['accuracy'], 
        'mlm_loss': l_mlm, 
        'nsp_loss': l_nsp, 
        'tot_loss': l_mlm + l_nsp 
    }

# training
BATCH_SZ  = 8
collator  = DataCollatorForLanguageModeling(tokenizer=tokenizer)
args      = TrainingArguments(
    num_train_epochs            = 3,
    per_device_train_batch_size = BATCH_SZ,
    #
    output_dir                  = './checkpoints', 
    overwrite_output_dir        = True,
    save_strategy               = "epoch", 
    save_total_limit            = 2,
    #
    eval_strategy               = "epoch",
    #
    gradient_accumulation_steps = 100,
    bf16                        = True)

trainer = Trainer(
    model           = model,
    tokenizer       = tokenizer,
    train_dataset   = data['train'], 
    eval_dataset    = data['test'],
    args            = args,
    data_collator   = collator,
    compute_metrics = compute_metrics
)    

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Mlm Accuracy,Nsp Accuracy,Mlm Loss,Nsp Loss,Tot Loss,Runtime,Samples Per Second,Steps Per Second
1,No log,6.478864,0.091077,1.0,6.467177,0.0002,6.467377,4.972,4.023,0.603
2,No log,6.544699,0.086753,1.0,6.563579,0.00019,6.563769,4.3325,4.616,0.692
3,No log,6.511369,0.095406,1.0,6.484383,0.000189,6.484572,4.2427,4.714,0.707


TrainOutput(global_step=3, training_loss=0.6485934654871622, metrics={'train_runtime': 30.3544, 'train_samples_per_second': 7.907, 'train_steps_per_second': 0.099, 'total_flos': 25220884782720.0, 'train_loss': 0.6485934654871622, 'epoch': 3.0})