# Welcome to Hanasu Encoder Trainer

In [None]:
# Install the `datasets` package (IPython shell escape); it is imported below for `load_dataset`.
!pip install datasets

# Load the model

In [None]:
import os
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset

In [None]:
# Name (or path) of the pretrained checkpoint to fine-tune.
model_checkpoint = "yukiarimo/yuna-ai-hanasu-v1"

# Load the tokenizer and the masked-LM model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Load your dataset

In [None]:
# Block size used for training (adjust as needed; note that mDeBERTa was
# pre-trained with a fixed max length).
block_size = 512

# Load the raw text file as the "train" split.
dataset = load_dataset(
    "text",
    data_files={"train": "/content/drive/MyDrive/dataset.txt"},
)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "text" column of a batch, truncating to ``block_size`` tokens.

    Relies on the module-level ``tokenizer`` and ``block_size``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=block_size)
    return encoded

# Tokenize in batches and drop the raw "text" column from the result.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Optionally, to create contiguous blocks over the entire text (instead of individual lines), we can concatenate and re-split:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch's token lists and re-split them into fixed-size blocks.

    Args:
        examples: Mapping from column name (e.g. ``"input_ids"``) to a list of
            per-example lists, as passed to a batched ``map`` function.
        chunk_size: Length of every output block. When omitted, the
            module-level ``block_size`` is used, so existing
            ``map(group_texts, batched=True)`` calls behave as before.

    Returns:
        A dict with the same keys, each mapped to a list of blocks of exactly
        ``chunk_size`` items. Trailing items that do not fill a whole block
        are dropped. An empty mapping yields an empty dict.

    Raises:
        ValueError: If the effective block size is not a positive integer.
    """
    from itertools import chain

    # Only touch the global when no explicit size was supplied.
    size = block_size if chunk_size is None else chunk_size
    if size <= 0:
        raise ValueError(f"block size must be positive, got {size}")

    # Flatten each column in linear time; ``sum(lists, [])`` re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    if not concatenated:
        return {}

    # All columns are assumed to have the same flattened length, so the first
    # one is used to measure it (same assumption as the original code).
    total_length = len(next(iter(concatenated.values())))
    # Drop the small remainder
    total_length = (total_length // size) * size
    # Split by chunks of the block size
    return {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated.items()
    }

# Data collator for the MLM objective: masking is enabled with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Regroup the tokenized examples into blocks, batch by batch, via group_texts.
lm_dataset = tokenized_dataset.map(group_texts, batched=True)

# This is where the magic happens

In [None]:
import torch

# Define training arguments.
training_args = TrainingArguments(
    output_dir="./mdeberta-finetuned-light-novels",
    overwrite_output_dir=True,
    num_train_epochs=10,                    # Adjust epochs as needed
    per_device_train_batch_size=4,          # Adjust batch size based on your GPU memory. 8 = 16GB GPU
    learning_rate=1e-5,                     # Adjust learning rate as needed
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),         # Mixed precision needs a CUDA device; fall back to full precision without one
    save_strategy="epoch",                  # One checkpoint per epoch; save_steps is not set because it only applies to save_strategy="steps"
    prediction_loss_only=True,              # During evaluation/prediction, return only the loss
    logging_steps=1,                        # Log after every optimizer step
    logging_dir="./logs",
    report_to="tensorboard",
)

# Initialize the Trainer with the block-grouped training split and the MLM collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    data_collator=data_collator,
)

# Start fine-tuning.
trainer.train()

# Save in safetensors

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Source checkpoint directory and destination directory for the exported model.
# NOTE(review): the Trainer above writes its checkpoints under its own
# output_dir; confirm checkpoint_path points at the checkpoint you want to export.
checkpoint_path = "/content/checkpoint"
final_model_path = "/content/hanasu-v1"

# Restore the model and the tokenizer from the checkpoint.
model = AutoModelForMaskedLM.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# Write both into the final directory (model first, then tokenizer).
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_path)

print(f"Final model saved to {final_model_path}")

# Inference

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Directory holding the saved model and tokenizer used for inference.
model_path = "/content/hanasu-v1"  # Path to your saved model

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

In [None]:
import torch

# Build the prompt with the tokenizer's own mask token so the placeholder
# always matches tokenizer.mask_token_id in the lookup below.
text = f"こんにちは、{tokenizer.mask_token}元気ですか？"
input_ids = tokenizer.encode(text, return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(input_ids)
    predictions = outputs.logits  # Get the model's predictions

# Column indices (sequence positions) of the masked tokens.
masked_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
# Highest-scoring token ID at each masked position.
predicted_token_ids = torch.argmax(predictions[0, masked_index], dim=-1)
# Raw vocabulary tokens, kept for inspection; they may carry subword markers.
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids.tolist())

# Reconstruct the sentence by writing the predicted IDs into a copy of the
# input and decoding once. String-replacing the mask in a decoded sentence
# would leave special tokens and raw subword markers in the output.
filled_ids = input_ids.clone()
filled_ids[0, masked_index] = predicted_token_ids
reconstructed_sentence = tokenizer.decode(filled_ids[0], skip_special_tokens=True)

print(reconstructed_sentence)