In [None]:
%%capture
%pip install -U neptune

In [None]:
from transformers import AutoTokenizer, GPT2Config, GPT2Model, AutoConfig,\
        AutoModelForCausalLM, Seq2SeqTrainer, Seq2SeqTrainingArguments,Gemma2Config
from datasets import load_dataset,Dataset
from transformers import DataCollatorWithPadding,Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [None]:
from huggingface_hub import notebook_login
from kaggle_secrets import UserSecretsClient

# Read the Neptune credentials from Kaggle's secret store so that no token is
# hardcoded in the notebook. A single client is reused for both lookups.
secrets_client = UserSecretsClient()
neptune_api = secrets_client.get_secret("NEPTUNE_API_TOKEN")
neptune_project = secrets_client.get_secret("NEPTUNE_PROJECT")
# Hugging Face Hub login is intentionally not performed here. To enable it, store a
# Hub token as a Kaggle secret, read it with `secrets_client.get_secret(...)`, and
# pass it to `huggingface_hub.login(token=...)`.

In [None]:
# The tokenizer is loaded from the local Gemma-2 2B JPN-IT checkpoint; only its
# vocabulary and special tokens are used, not the Gemma weights.
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/gemma-2-2b-jpn-it/transformers/gemma-2-2b-jpn-it/1')

# Candidate sizes for a larger model. NOTE: these are currently unused because the
# matching overrides in `config_kwargs` are commented out, so the stock 'gpt2'
# dimensions apply.
n_head = 32
n_layer = 24
n_embd = 1024

# Overrides applied on top of the stock 'gpt2' configuration.
config_kwargs = {
    # The embedding matrix must cover every id the tokenizer can emit.
    "vocab_size": len(tokenizer),
    "scale_attn_by_layer_idx": True,
    "bos_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    # Batches are padded with the tokenizer's pad token, so the model config must
    # know it as well (used by `generate` when padding finished sequences).
    "pad_token_id": tokenizer.pad_token_id,
    "reorder_and_upcast_attn": True,
    # "n_head": n_head,
    # "n_layer": n_layer,
    # "n_embd": n_embd,
}

# Build the GPT-2 config and instantiate a model from it. `from_config` does not
# load pretrained weights, so the model starts from random initialisation.
config = AutoConfig.from_pretrained('gpt2', **config_kwargs)
model = AutoModelForCausalLM.from_config(config)
# To resume from an earlier run instead of starting from scratch:
# model = AutoModelForCausalLM.from_pretrained('/kaggle/input/customgpt2/transformers/default/1/results/checkpoint-26512')

In [None]:
def read_parallel_corpus(path):
    """Read a tab-separated parallel corpus with English first and Japanese second.

    Each usable line has the form ``<english>\\t<japanese>``. Lines with fewer
    than two tab-separated fields, or with an empty field on either side, are
    skipped instead of raising ``IndexError``.

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        A tuple ``(english, japanese, skipped)`` where the first two items are
        equally long lists of sentences and ``skipped`` is the number of lines
        that were dropped.
    """
    english, japanese = [], []
    skipped = 0
    # The encoding is explicit so Japanese text is decoded the same way
    # regardless of the platform's default locale.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Only strip the line terminator before splitting: stripping all
            # whitespace would also remove a leading/trailing tab and hide an
            # empty field.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                skipped += 1
                continue
            eng, jp = fields[0].strip(), fields[1].strip()
            if not eng or not jp:
                skipped += 1
                continue
            english.append(eng)
            japanese.append(jp)
    return english, japanese, skipped


eng_text, jp_text, n_skipped = read_parallel_corpus('/kaggle/input/japanese-english-subtitle-corpus/split/train')
if n_skipped:
    print(f"Skipped {n_skipped} malformed line(s) while reading the corpus")

# Column layout expected downstream: Japanese is the source, English the target.
dataset = {
    'src': jp_text,
    'trg': eng_text,
}

In [None]:
# Wrap the column dict ('src' / 'trg' lists) in a `datasets.Dataset`.
raw_datasets = Dataset.from_dict(dataset)

In [None]:
from transformers import DataCollatorForSeq2Seq
class TranslationDataCollator(DataCollatorForSeq2Seq):
    """Collate raw Japanese/English pairs into causal-LM training batches.

    Each feature is a dict with the keys ``"src"`` (Japanese text) and
    ``"trg"`` (English text). Every pair is rendered as::

        <bos>Translate the following Japanese sentence to English:

        Japanese: {src}
        English:{trg}<eos>

    and the batch is padded to its longest sequence.

    Args:
        tokenizer: Tokenizer providing ``bos_token_id``, ``eos_token_id``,
            ``encode``, ``pad`` and ``__call__``.
        max_length: Maximum number of tokens kept for the prompt and,
            separately, for the target text.
    """

    def __init__(self, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Label value used on padded positions; -100 is the index ignored by
        # the cross-entropy loss of Hugging Face language models.
        self.label_pad_token_id = -100

    def __call__(self, features):
        """Tokenize and pad a list of ``{"src", "trg"}`` examples.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels``
            tensors. ``labels`` equals ``input_ids`` except on padded
            positions, which hold ``label_pad_token_id``.
        """
        tok = self.tokenizer

        # Loop-invariant answer marker. Special tokens are disabled so that no
        # extra BOS is inserted in the middle of the sequence, which would
        # differ from a prompt encoded in one piece at inference time.
        answer_marker = tok.encode("\nEnglish:", add_special_tokens=False)

        sequences = []
        for example in features:
            # Get the Japanese (src) and English (trg) text
            src_text = example["src"]
            trg_text = example["trg"]

            # Format prompt for translation
            prompt = f"Translate the following Japanese sentence to English:\n\nJapanese: {src_text}"

            # Prompt and target are truncated independently, so the marker and
            # the EOS token are never cut off.
            prompt_tokens = tok(prompt, truncation=True, max_length=self.max_length, add_special_tokens=False)["input_ids"]
            trg_tokens = tok(trg_text, truncation=True, max_length=self.max_length, add_special_tokens=False)["input_ids"]

            sequences.append(
                [tok.bos_token_id] + prompt_tokens + answer_marker + trg_tokens + [tok.eos_token_id]
            )

        # Pad once and keep the attention mask so the model can ignore padding
        # regardless of the tokenizer's padding side.
        batch = tok.pad(
            {"input_ids": sequences},
            padding=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        # Causal-LM labels mirror the inputs; padded positions are excluded
        # from the loss instead of being trained on as pad tokens.
        labels = input_ids.masked_fill(attention_mask == 0, self.label_pad_token_id)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Create an instance of the data collator
data_collator = TranslationDataCollator(tokenizer=tokenizer)

In [None]:
# Training hyperparameters.
# The run length is controlled by `max_steps` alone: a positive `max_steps`
# overrides `num_train_epochs`, so the previous `num_train_epochs=1` had no
# effect and has been removed to avoid suggesting a one-epoch run.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-4,
    per_device_train_batch_size=16,  # Adjust based on GPU memory
    gradient_accumulation_steps=2,   # effective batch size per device = 16 * 2
    lr_scheduler_type='cosine',
    warmup_steps=250,
    max_steps=65000,
    weight_decay=0.01,
    logging_steps=1,
    save_steps=1000,
    save_total_limit=1,              # keep only the most recent checkpoint on disk
    push_to_hub=False,
    # The dataset holds raw 'src'/'trg' text that the data collator consumes;
    # the Trainer must not drop columns the model's forward() does not accept.
    remove_unused_columns=False,
    # Disable the automatic reporting integrations.
    report_to='none'
)

In [None]:
import neptune
from transformers.integrations import NeptuneCallback
# Experiment-tracking callback, configured with the project name and API token
# read from the Kaggle secrets.
neptune_callback = NeptuneCallback(api_token=neptune_api, project=neptune_project)

In [None]:
# Assemble the Trainer. The dataset is passed as raw text pairs: tokenization,
# prompt formatting and padding all happen per batch inside `data_collator`.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=raw_datasets,
    data_collator=data_collator,
    callbacks=[neptune_callback],
)

In [None]:
# Run the training loop configured by `training_args`; the call's return value
# is left as the cell's last expression so the notebook displays it.
trainer.train()

In [None]:
# Quick qualitative check: translate one Japanese sentence with the trained model.
src_text='素人の気づき 「いい、アーニャ。今から行ったとしても、陛下が実際'
# Same prompt layout as the one used for training, ending with the answer marker.
prompt = f"Translate the following Japanese sentence to English:\n\nJapanese: {src_text}\nEnglish:"

# The model is still in training mode after `trainer.train()`; switch to eval
# mode so dropout does not perturb generation.
model.eval()

# Put the inputs on whatever device the model lives on instead of assuming CUDA.
encoded = tokenizer(prompt, return_tensors='pt').to(model.device)
generated = model.generate(
    input_ids=encoded["input_ids"],
    attention_mask=encoded["attention_mask"],
    # Budget for the translation only: `max_length` would also count the prompt
    # tokens and leave little or no room after a long source sentence.
    max_new_tokens=128,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.batch_decode(generated)[0])