In [1]:
from datasets import Dataset
import pandas as pd

# Read the processed comment table from its parquet file.
comments_path = "data/processed/czech_media_comments.parquet"
czech_comments = pd.read_parquet(comments_path)

# Wrap only the `text` column in a Hugging Face Dataset.
text_only_frame = czech_comments[["text"]]
czech_dataset = Dataset.from_pandas(text_only_frame)

In [2]:
from transformers import AutoTokenizer

# Tokenizer belonging to the multilingual DistilBERT checkpoint.
tokenizer_checkpoint = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_function(example):
    """Run the module-level ``tokenizer`` over the ``"text"`` field of ``example``.

    The tokenizer is called with ``truncation=True``, ``padding="max_length"``
    and ``max_length=128``; its result is returned unchanged.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    texts = example["text"]
    return tokenizer(texts, **encode_options)

# Tokenize batch-wise and drop the raw "text" column from the result.
tokenize_map_options = dict(batched=True, remove_columns=["text"])
tokenized_dataset = czech_dataset.map(tokenize_function, **tokenize_map_options)

# Masking function for MLM
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language modelling, configured with a masking
# probability of 0.15.
mlm_collator_options = {
    "tokenizer": tokenizer,
    "mlm": True,
    "mlm_probability": 0.15,
}
data_collator = DataCollatorForLanguageModeling(**mlm_collator_options)

Map:   0%|          | 0/845764 [00:00<?, ? examples/s]

In [3]:
from adapters import AutoAdapterModel

# Name under which the new adapter is registered (reused when saving it).
adapter_name = "czech_comments_mlm"
base_checkpoint = "distilbert-base-multilingual-cased"

# Load the base checkpoint through the adapters wrapper class.
model = AutoAdapterModel.from_pretrained(base_checkpoint)

# Register the adapter with the "pfeiffer" config, switch the model into
# adapter-training mode for it, and make it the active adapter.
model.add_adapter(adapter_name, config="pfeiffer")
model.train_adapter(adapter_name)
model.set_active_adapters(adapter_name)

In [4]:
import random

# --- Subset configuration -------------------------------------------------
TARGET_ROWS = 120_000          # keep ~14 % of 850 000
SEED        = 42               # for reproducibility

# Shuffle every row position with a seeded generator, then keep the first
# TARGET_ROWS positions of the shuffled order.
shuffled_positions = list(range(len(tokenized_dataset)))
seeded_rng = random.Random(SEED)
seeded_rng.shuffle(shuffled_positions)

# Materialise the reduced dataset from the chosen positions.
medium_dataset = tokenized_dataset.select(shuffled_positions[:TARGET_ROWS])

print(f"medium_dataset has {len(medium_dataset):,} rows")

medium_dataset has 120,000 rows


In [5]:
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
import random

# ---------------------------------------------------------------------
# 1.  Sample up to 120 000 rows WITHOUT shuffling the whole table
# ---------------------------------------------------------------------
SAMPLE_ROWS = 120_000
SEED        = 42

rng        = random.Random(SEED)
total_rows = len(tokenized_dataset)
# random.sample raises ValueError when asked for more items than the
# population holds, so never request more rows than the dataset has.
n_sampled  = min(SAMPLE_ROWS, total_rows)
indices    = rng.sample(range(total_rows), n_sampled)  # light-weight
mini_ds    = tokenized_dataset.select(indices)

print(f"mini_ds has {mini_ds.num_rows:,} rows")    # sanity-check

mini_ds has 120,000 rows


In [6]:
# ---------------------------------------------------------------------
# 2.  Truncate each sequence to 128 tokens (do it in one process)
# ---------------------------------------------------------------------
def truncate(batch, max_length=128):
    """Clip the token-level columns of ``batch`` to at most ``max_length`` items.

    Args:
        batch: Mapping from column name to a list of sequences. Only the
            columns ``"input_ids"``, ``"attention_mask"`` and
            ``"token_type_ids"`` are touched, and only when present.
        max_length: Maximum number of items kept per sequence. Defaults to
            128, the limit that used to be hard-coded.

    Returns:
        The same ``batch`` object, modified in place.

    Raises:
        ValueError: If ``max_length`` is negative (a negative slice bound
            would silently drop items from the end instead of limiting length).
    """
    if max_length < 0:
        raise ValueError(f"max_length must be >= 0, got {max_length}")
    for field in ("input_ids", "attention_mask", "token_type_ids"):
        if field in batch:
            batch[field] = [seq[:max_length] for seq in batch[field]]
    return batch

# Apply `truncate` batch-wise: 1 000-row batches, a single worker process,
# and no reuse of a previously cached result.
truncate_map_options = {
    "batched": True,
    "batch_size": 1_000,
    "num_proc": 1,
    "load_from_cache_file": False,
}
mini_ds = mini_ds.map(truncate, **truncate_map_options)

# ---------------------------------------------------------------------
# 3.  Everything else is just your original Trainer code
# ---------------------------------------------------------------------
# Build the language-modelling collator around the existing tokenizer,
# with a masking probability of 0.15.
masking_probability = 0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer,
    mlm_probability=masking_probability,
)


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

In [7]:
from transformers import Trainer, TrainingArguments
# ── 3.  build the trainer exactly as before, but safer defaults ───────────────
# Relies on `model`, `mini_ds` and `data_collator` created in earlier cells.
training_args = TrainingArguments(
    output_dir="./adapter_mlm_output",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,       # small per-device batch to limit GPU memory
    gradient_accumulation_steps=2,       # → effective batch 16 per device
    fp16=True,                           # mixed-precision training
    learning_rate=5e-4,
    save_steps=10_000,
    save_total_limit=2,
    # `evaluation_strategy` is the deprecated spelling of this option; use the
    # current name, matching the other TrainingArguments in this notebook.
    eval_strategy="no",
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=mini_ds,
    data_collator=data_collator,
)

trainer.train()



Step,Training Loss
500,6.8607
1000,6.4933
1500,6.322
2000,6.1589
2500,6.0849
3000,5.9922
3500,5.8859
4000,5.7795
4500,5.7287
5000,5.7398


TrainOutput(global_step=37500, training_loss=5.229800703125, metrics={'train_runtime': 4009.2328, 'train_samples_per_second': 149.655, 'train_steps_per_second': 9.353, 'total_flos': 2.01312953856e+16, 'train_loss': 5.229800703125, 'epoch': 5.0})

In [7]:
import json, os

# Inspect the trainer state stored next to a saved checkpoint.
checkpoint_path = "adapter_mlm_output/checkpoint-37500"
state_file = os.path.join(checkpoint_path, "trainer_state.json")
with open(state_file, encoding="utf-8") as f:
    checkpoint_data = json.load(f)

# Get training state info
global_step = checkpoint_data["global_step"]
epoch = checkpoint_data["epoch"]
best_model_checkpoint = checkpoint_data["best_model_checkpoint"]
log_history = checkpoint_data["log_history"]

# The final log entry is not guaranteed to carry a "loss" key, so walk the
# history backwards and take the most recent entry that does.
last_loss = next(
    (entry["loss"] for entry in reversed(log_history) if "loss" in entry),
    None,
)

print("Checkpoint info:")
print(f"Global step: {global_step}")
print(f"Epoch: {epoch:.2f}")
print(f"Best model checkpoint: {best_model_checkpoint}")
if last_loss is None:
    print("Last training loss: n/a")
else:
    print(f"Last training loss: {last_loss:.3f}")

Checkpoint info:
Global step: 37500
Epoch: 5.00
Best model checkpoint: None
Last training loss: 4.846


In [9]:
from transformers import TrainingArguments, Trainer

# Continue training from the saved checkpoint.
# Relies on `model`, `mini_ds` and `data_collator` created in earlier cells.
training_args = TrainingArguments(
    output_dir          = "./adapter_mlm_output",
    # NOTE(review): resuming also restores the optimizer/scheduler state stored
    # in the checkpoint, so the learning rate actually applied may differ from
    # the values requested here — confirm against the logged learning rate.
    learning_rate       = 2e-4,
    lr_scheduler_type   = "cosine",
    # max_steps is the TOTAL step count, not an increment: resuming at step
    # 37 500 leaves 17 500 further updates.
    max_steps           = 55000,
    fp16                = True,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 2,
    # Same checkpoint interval as the first run instead of the library default.
    # No save_total_limit here, so the checkpoint being resumed from is never
    # rotated away and this cell stays re-runnable.
    save_steps          = 10_000,
    # `evaluation_strategy` is the deprecated spelling of this option.
    eval_strategy="no",
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",
    push_to_hub=False,
)

trainer = Trainer(
    model          = model,        # the same model object (or reload it)
    args           = training_args,
    train_dataset  = mini_ds,
    data_collator  = data_collator,
)

# Point to the checkpoint directory written by the previous run
trainer.train(resume_from_checkpoint="adapter_mlm_output/checkpoint-37500")

There were missing keys in the checkpoint model loaded: ['distilbert.prompt_tuning.base_model_embeddings.weight', 'heads.default.3.weight'].


Step,Training Loss
38000,4.8114
38500,4.8214
39000,4.8071
39500,4.7865
40000,4.8347
40500,4.8563
41000,4.7981
41500,4.8405
42000,4.7836
42500,4.7873


KeyboardInterrupt: 

: 

In [1]:
from transformers import Trainer, TrainingArguments


# Requires `model`, `medium_dataset` and `data_collator` to already exist in
# the running kernel; none of them is created in this cell.
training_options = {
    "output_dir": "./adapter_mlm_output",
    "overwrite_output_dir": True,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "learning_rate": 5e-3,
    "save_steps": 10_000,
    "save_total_limit": 2,
    "eval_strategy": "no",
    "logging_dir": "./logs",
    "logging_steps": 500,
    "push_to_hub": False,
    "report_to": "none",
}
training_args = TrainingArguments(**training_options)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=medium_dataset,
)

trainer.train()

NameError: name 'model' is not defined

In [21]:
# Write the adapter registered under `adapter_name` to its output directory.
adapter_output_dir = "./output/czech_comments_mlm"
model.save_adapter(adapter_output_dir, adapter_name)

In [17]:
# Rows 10 000–19 999 of the tokenized data form the subset for this round.
second_subset_rows = range(10000, 20000)
small_dataset_2 = tokenized_dataset.select(second_subset_rows)

# Arguments shared by the follow-up runs on further subsets.
follow_up_options = dict(
    output_dir="./adapter_mlm_output",
    overwrite_output_dir=True,
    num_train_epochs=5,  # epochs performed by each run using these arguments
    per_device_train_batch_size=16,
    learning_rate=5e-3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
    push_to_hub=False,
    report_to="none",
)
new_training_args = TrainingArguments(**follow_up_options)

In [18]:
# Fresh Trainer around the existing `model` object, using the follow-up
# arguments and the second subset.
second_round_setup = {
    "model": model,
    "args": new_training_args,
    "train_dataset": small_dataset_2,
    "data_collator": data_collator,
}
new_trainer = Trainer(**second_round_setup)

new_trainer.train()

Step,Training Loss
500,2.947
1000,2.8647
1500,2.8372
2000,2.8105
2500,2.7509
3000,2.7052


TrainOutput(global_step=3125, training_loss=2.815876767578125, metrics={'train_runtime': 485.7561, 'train_samples_per_second': 102.932, 'train_steps_per_second': 6.433, 'total_flos': 1677607948800000.0, 'train_loss': 2.815876767578125, 'epoch': 5.0})

In [19]:
# Rows 20 000–79 999 of the tokenized data form the third subset.
third_subset_rows = range(20000, 80000)
small_dataset_3 = tokenized_dataset.select(third_subset_rows)

In [20]:
# Fresh Trainer around the existing `model` object, using the follow-up
# arguments and the third subset.
third_round_setup = {
    "model": model,
    "args": new_training_args,
    "train_dataset": small_dataset_3,
    "data_collator": data_collator,
}
new_trainer = Trainer(**third_round_setup)

new_trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [22]:
from transformers import AutoTokenizer

# Reload the tokenizer and report the size of its vocabulary.
vocab_checkpoint = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(vocab_checkpoint)
print(f"Vocabulary size: {tokenizer.vocab_size}")

Vocabulary size: 119547
