In [1]:
%%capture
# Dependency setup cell: upgrade the Neptune client used for experiment tracking further down.
# The %%capture cell magic on the first line keeps pip's output out of the notebook.
%pip install -U neptune
# %pip install -U transformers

In [2]:
from transformers import AutoTokenizer, GPT2Config, GPT2Model, AutoConfig,\
        AutoModelForCausalLM, Seq2SeqTrainer, Seq2SeqTrainingArguments,Gemma2Config
from datasets import load_dataset,Dataset
import numpy as np
import torch
from transformers import DataCollatorWithPadding,Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [3]:
from huggingface_hub import notebook_login
from kaggle_secrets import UserSecretsClient

# Fetch the Neptune credentials from Kaggle's secret store through a single client
# so no token ever appears in the notebook source.
secrets_client = UserSecretsClient()
neptune_api = secrets_client.get_secret("NEPTUNE_API_TOKEN")
neptune_project = secrets_client.get_secret("NEPTUNE_PROJECT")
# notebook_login(hf_token)

In [4]:
# Tokenizer and model weights both come from the same pre-trained checkpoint directory.
CHECKPOINT_DIR = '/kaggle/input/customgpt2/transformers/default/1/results/checkpoint-80000'

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR, padding_side='right')

# Architecture hyper-parameters; currently not injected into the config (see the
# commented-out keys below).
n_head = 32
n_layer = 24
n_embd = 1024
config_kwargs = {"vocab_size": len(tokenizer),
                 "scale_attn_by_layer_idx": True,
                 "bos_token_id": tokenizer.bos_token_id,
                 "eos_token_id": tokenizer.eos_token_id,
                 "pad_token_id": tokenizer.pad_token_id,
                 "reorder_and_upcast_attn": True,
                 # "n_head":n_head,
                 # "n_layer":n_layer,
                 # "n_embd":n_embd,
                 }

# GPT-2 base config with the overrides above. It is NOT used to build `model` below;
# it is kept only so any other cell that refers to `config` keeps working.
config = AutoConfig.from_pretrained('gpt2', **config_kwargs)

# Load the fine-tuning starting point straight from the checkpoint. The previous
# `AutoModelForCausalLM.from_config(config)` call allocated a randomly initialised
# model that was overwritten on the very next line, so it has been removed.
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_DIR)
# model.save_pretrained('./model')
# model.cuda()

In [5]:
# Read the tab-separated parallel corpus. The first field of each line is stored as
# the English text and the second as the Japanese text.
eng_text = []
jp_text = []
# Explicit UTF-8 so decoding the Japanese text does not depend on the platform's
# default locale encoding.
with open('/kaggle/input/japanese-english-subtitle-corpus/split/train', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        # Blank or malformed lines have no second column; skip them instead of
        # crashing with IndexError part-way through the file.
        if len(fields) < 2:
            continue
        eng_text.append(fields[0])
        jp_text.append(fields[1])

# Column layout expected downstream: 'src' is Japanese, 'trg' is English.
dataset = {
    'src': jp_text,
    'trg': eng_text
}

In [6]:
# Shuffle with a fixed seed so the example order (and therefore every sample shown
# below and every training run) is reproducible after Restart & Run All. The seed was
# previously drawn from unseeded np.random, which made each run different.
SHUFFLE_SEED = 42
raw_datasets = Dataset.from_dict(dataset).shuffle(seed=SHUFFLE_SEED)

In [7]:
# Spot-check one shuffled pair: Japanese source followed by its English target.
sample_pair = raw_datasets[100]
print(sample_pair['src'], sample_pair['trg'])

確かに過去の備えでは無かったが 我々は武器を持っている now, this city may not have the manpower it once did... but it has the firepower.


In [8]:
import logging

from torch.utils.data import IterableDataset

# Module-level logger used to report epoch roll-overs in infinite mode.
logger = logging.getLogger(__name__)


class ConstantLengthDataset(IterableDataset):
    """Stream fixed-length token blocks built from Japanese->English prompt strings.

    Each example from ``dataset`` is rendered as
    ``<prompt><src>\\nEnglish:<trg>``, a buffer of such strings is tokenized in one
    call, the token ids are concatenated, and the stream is cut into blocks of
    exactly ``seq_length`` ids. A trailing partial block of each buffer is dropped.

    Args:
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``; must
            expose ``bos_token_id``. Its ``add_eos_token`` attribute is set to True.
        dataset: Iterable of mappings with ``"src"`` (Japanese) and ``"trg"``
            (English) string fields.
        infinite: When True, restart the dataset every time it is exhausted.
        seq_length: Number of token ids in every yielded block.
        num_of_sequences: Target number of blocks per buffer (used only to size it).
        chars_per_token: Estimated characters per token (used only to size the buffer).

    Yields:
        dict with ``input_ids`` and ``labels``, two tensors holding the same
        ``seq_length`` token ids.
    """

    def __init__(
        self, tokenizer, dataset, infinite=False, seq_length=320, num_of_sequences=1024, chars_per_token=5.5
    ):
        self.tokenizer = tokenizer
        # NOTE: this mutates the tokenizer object shared with the rest of the notebook.
        # The decoded sample in this notebook shows every example wrapped in <bos>...<eos>.
        self.tokenizer.add_eos_token = True
        self.concat_token_id = tokenizer.bos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        # Character budget per buffer, i.e. roughly num_of_sequences blocks of tokens.
        self.input_characters = seq_length * chars_per_token * num_of_sequences
        self.epoch = 0
        self.infinite = infinite
        self.prompt = "Translate the following Japanese sentence to English:\n\nJapanese:"

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        # Examples read since the iterator was last (re)started; used to detect a
        # dataset that yields nothing, which would otherwise loop forever in
        # infinite mode.
        examples_this_pass = 0
        while more_examples:
            buffer, buffer_len = [], 0
            while buffer_len < self.input_characters:
                try:
                    cur_data = next(iterator)
                except StopIteration:
                    if self.infinite and examples_this_pass > 0:
                        iterator = iter(self.dataset)
                        examples_this_pass = 0
                        self.epoch += 1
                        logger.info(f"Dataset epoch: {self.epoch}")
                        continue
                    more_examples = False
                    break
                examples_this_pass += 1
                buffer.append(self.prompt + cur_data["src"] + "\nEnglish:" + cur_data["trg"])
                buffer_len += len(buffer[-1])

            # Nothing was collected (dataset exhausted exactly on a buffer boundary,
            # or empty dataset): there is nothing to tokenize.
            if not buffer:
                break

            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input)
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                # Only full-length blocks are emitted; the buffer's tail is discarded.
                if len(input_ids) == self.seq_length:
                    yield {'input_ids': torch.tensor(input_ids), 'labels': torch.tensor(input_ids)}

In [9]:
# Endless stream of packed training blocks, plus an iterator over it for inspection.
train_dataset = ConstantLengthDataset(tokenizer=tokenizer, dataset=raw_datasets, infinite=True)
it = iter(train_dataset)

In [10]:
# Pull one packed block, report its length, and decode it back to text as a sanity check.
first_block = next(it)
x = first_block['input_ids']
print(len(x))
tokenizer.decode(x)

320


"<bos>Translate the following Japanese sentence to English:\n\nJapanese:また ヒゲの部分には\nEnglish:it's also got a little chin barbel here<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:新鮮な空気を\nEnglish:sme fresh air.<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:ルーカス・デサンジュだ\nEnglish:i'm family. lucas desange.<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:性的暴行は?\nEnglish:any sexual violence?<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:しもた! 何やっとると?\nEnglish:what are you doing? i'm coming in!<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:スイーツがあなたの昇進前の 最後の事件になるって言ってた すごいところを見せてよ\nEnglish:sweets would say that since this may be your last case before being promoted, you need to prove that you're still the best.<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:e・ハーモニーのトッド・マハーです ご用件は?\nEnglish:hi! todd mahar, eharmony. how can i he

In [11]:
# from transformers import DataCollatorForSeq2Seq
# class TranslationDataCollator(DataCollatorForSeq2Seq):
#     def __init__(self, tokenizer, max_length=128):
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __call__(self, features):
#         inputs = []
#         labels = []
        
#         for example in features:
#             # Get the Japanese (src) and English (trg) text
#             src_text = example["src"]
#             trg_text = example["trg"]

#             # Format prompt for translation
#             prompt = f"Translate the following Japanese sentence to English:\n\nJapanese:{src_text}"
#             len_of_chat_template=64
#             # Tokenize prompt and target text
#             prompt_tokens = self.tokenizer(prompt, truncation=True, max_length=self.max_length+len_of_chat_template, add_special_tokens=False)["input_ids"]
#             trg_tokens = self.tokenizer(trg_text, truncation=True, max_length=self.max_length, add_special_tokens=False)["input_ids"]
            
#             # Concatenate tokens with BOS and EOS
#             input_ids = [self.tokenizer.bos_token_id] + prompt_tokens + self.tokenizer.encode("\nEnglish:",add_special_tokens=False) + trg_tokens + [self.tokenizer.eos_token_id]
#             label_ids = input_ids.copy()  # Autoregressive model needs labels to match input
            
#             inputs.append(input_ids)
#             labels.append(label_ids)

#         # Pad sequences in the batch
#         inputs = self.tokenizer.pad({"input_ids": inputs}, padding=True, return_tensors="pt")["input_ids"]
#         labels = self.tokenizer.pad({"input_ids": labels}, padding=True, return_tensors="pt")["input_ids"]

#         # print(self.tokenizer.batch_decode(inputs))
        
#         return {
#             "input_ids": inputs,
#             "labels": labels,
#         }

# # Create an instance of the data collator
# data_collator = TranslationDataCollator(tokenizer=tokenizer)

In [34]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Output and checkpointing: save every 1000 steps, keep only the latest checkpoint.
    output_dir="./results",
    save_steps=1000,
    save_total_limit=1,
    push_to_hub=False,
    # Batching: 4 sequences per device x 2 accumulation steps per optimizer update.
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    gradient_accumulation_steps=2,
    # Run length: max_steps takes precedence over num_train_epochs (see the message
    # printed when the Trainer is created).
    max_steps=45000,
    num_train_epochs=1,
    # Optimizer and schedule.
    learning_rate=5e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_steps=200,
    # Logging: built-in reporters are off; Neptune is attached as an explicit callback.
    logging_steps=20,
    report_to='none',
    # Keep every key the dataset yields.
    remove_unused_columns=False,
)

In [35]:
import neptune
from transformers.integrations import NeptuneCallback
# Experiment-tracking callback, authenticated with the credentials read from Kaggle secrets.
neptune_callback = NeptuneCallback(api_token=neptune_api, project=neptune_project)

In [36]:
# Plain causal-LM Trainer: the dataset already yields tokenized, fixed-length
# `input_ids`/`labels` tensors, so no custom data collator is supplied.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    callbacks=[neptune_callback],
)

max_steps is given, it will override any value given in num_train_epochs


In [37]:
# Launch fine-tuning. In the stored output below the run was stopped by hand
# (KeyboardInterrupt) after the step-200 log line.
trainer.train()

Step,Training Loss
20,2.4445
40,2.4343
60,2.4276
80,2.3814
100,2.3196
120,2.5682
140,2.546
160,2.5662
180,2.518
200,2.5329


KeyboardInterrupt: 

In [49]:
# Qualitative check: translate one Japanese sentence with the fine-tuned model.
src_text='ないんたけとな。吠えるな、園山カンナっていや、'

# Must match the prompt format used to build the training blocks.
prompt = f"Translate the following Japanese sentence to English:\n\nJapanese:{src_text}\nEnglish:"
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# The tokenizer was configured to append EOS; remove it so the model continues the
# prompt instead of seeing an already-finished sequence. Only strip when the last
# token really is EOS, so a differently configured tokenizer does not lose its
# final prompt token.
if input_ids[0, -1].item() == tokenizer.eos_token_id:
    input_ids = input_ids[:, :-1]

# Training leaves the model in train mode; switch to eval so dropout is disabled
# and generation is not perturbed by it.
model.eval()
with torch.no_grad():
    # Follow whatever device the model currently lives on rather than assuming CUDA.
    generated = model.generate(input_ids.to(model.device), max_length=128)

print(tokenizer.batch_decode(generated)[0])

<bos>Translate the following Japanese sentence to English:

Japanese:ないんたけとな。吠えるな、園山カンナっていや、
English:no, no, no, no, no, no, no, no, no, no, no, no.<eos>
