In [1]:
%%capture
# Install/upgrade the Neptune client used by the NeptuneCallback further down.
# NOTE(review): the install is unpinned (-U), so re-runs may pick up a different
# version — consider pinning once a known-good version is confirmed.
%pip install -U neptune
# %pip install -U transformers

In [2]:
from transformers import AutoTokenizer, GPT2Config, GPT2Model, AutoConfig,\
        AutoModelForCausalLM, Seq2SeqTrainer, Seq2SeqTrainingArguments,Gemma2Config
from datasets import load_dataset,Dataset
import numpy as np
import torch
from transformers import DataCollatorWithPadding,Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [3]:
from huggingface_hub import notebook_login
from kaggle_secrets import UserSecretsClient

# Neptune credentials come from Kaggle secrets so they never appear in the notebook.
neptune_api, neptune_project = (
    UserSecretsClient().get_secret(secret_name)
    for secret_name in ("NEPTUNE_API_TOKEN", "NEPTUNE_PROJECT")
)

# Hub checkpoint that provides both the tokenizer and the starting weights.
base_model_id = 'tirthadagr8/Japanese_to_english_gpt2CasualLM_GemmaTokenizer'
# notebook_login(hf_token)

In [4]:
# Tokenizer published with the base checkpoint; pad on the right.
tokenizer=AutoTokenizer.from_pretrained(base_model_id,padding_side='right')

# Architecture sizes for an optional larger from-scratch model (currently unused:
# the matching config keys below are commented out).
n_head=32
n_layer=24
n_embd=1024
config_kwargs = {"vocab_size": len(tokenizer),
                 "scale_attn_by_layer_idx": True,
                 "bos_token_id":tokenizer.bos_token_id,
                 "eos_token_id":tokenizer.eos_token_id,
                 "pad_token_id":tokenizer.pad_token_id,
                 "reorder_and_upcast_attn": True,
                 # "n_head":n_head,
                 # "n_layer":n_layer,
                 # "n_embd":n_embd,
                 }

# GPT-2 config adapted to this tokenizer. Kept so a model can still be built from
# scratch with AutoModelForCausalLM.from_config(config) if desired.
config = AutoConfig.from_pretrained('gpt2', **config_kwargs)

# Continue training from the published checkpoint. The previous version first
# built a randomly initialised model from `config` and then immediately
# overwrote it with this call, wasting time and memory for nothing.
model = AutoModelForCausalLM.from_pretrained(base_model_id)
# model.save_pretrained('./model')
# model.cuda()

tokenizer_config.json:   0%|          | 0.00/46.3k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/988 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [5]:
# Read the tab-separated parallel corpus: column 0 is English, column 1 is Japanese.
eng_text=[]
jp_text=[]
skipped_lines=0
# Explicit UTF-8 so the Japanese text decodes the same regardless of locale.
with open('/kaggle/input/japanese-english-subtitle-corpus/split/train','r',encoding='utf-8') as f:
    for line in f:
        txt=line.strip().split('\t')
        # Lines without both columns would raise IndexError below; skip and count them.
        if len(txt)<2:
            skipped_lines+=1
            continue
        eng_text.append(txt[0])
        jp_text.append(txt[1])
if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) without a tab-separated pair")

# Column-oriented dict consumed by Dataset.from_dict: Japanese source, English target.
dataset={
    'src':jp_text,
    'trg':eng_text
}

In [6]:
# Fixed shuffle seed so Restart & Run All reproduces the same sample order
# (the seed used to be drawn from np.random.randint on every run).
SHUFFLE_SEED=42
raw_datasets=Dataset.from_dict(dataset).shuffle(seed=SHUFFLE_SEED)

In [7]:
# Sanity check: print one shuffled pair (Japanese source, then English target).
print(*(raw_datasets[100][column] for column in ('src', 'trg')))

パワーを取り戻すためには...。 in order to regain one's power...


In [8]:
import logging

from torch.utils.data import IterableDataset

# Module-level logger; the class previously referenced an undefined `logger`,
# which raised NameError the first time an infinite dataset wrapped around.
logger = logging.getLogger(__name__)


class ConstantLengthDataset(IterableDataset):
    """Stream fixed-length token chunks built from prompt-formatted translation pairs.

    Examples are read from ``dataset`` (an iterable of mappings with ``"src"`` and
    ``"trg"`` keys), rendered as
    ``<prompt><src>\\nEnglish:<trg>``, tokenized in batches, concatenated into one
    long token stream and cut into chunks of exactly ``seq_length`` tokens.
    Trailing tokens of each buffer that do not fill a whole chunk are dropped.

    Args:
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``;
            must expose ``bos_token_id``. NOTE: ``add_eos_token`` is set to
            ``True`` on this object, so the caller's tokenizer is mutated.
        dataset: Iterable of examples with ``"src"`` and ``"trg"`` strings.
        infinite: If true, restart from the beginning of ``dataset`` whenever it
            is exhausted (incrementing ``epoch``) instead of stopping.
        seq_length: Number of tokens in every yielded chunk.
        num_of_sequences: Together with ``seq_length`` and ``chars_per_token``,
            sets how many characters are buffered before tokenizing.
        chars_per_token: Estimated characters per token used for that budget.

    Yields:
        dict with ``"input_ids"`` and ``"labels"`` tensors holding the same
        ``seq_length`` token ids.
    """

    def __init__(
        self, tokenizer, dataset, infinite=False, seq_length=320, num_of_sequences=1024, chars_per_token=5.5
    ):
        self.tokenizer = tokenizer
        # Mutates the shared tokenizer. Code elsewhere in the notebook (the
        # inference cell) accounts for the extra trailing token this produces.
        self.tokenizer.add_eos_token=True
        # Kept for compatibility; not used when concatenating (see __iter__).
        self.concat_token_id = tokenizer.bos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        # Character budget per buffer before a tokenization pass is triggered.
        self.input_characters = seq_length * chars_per_token * num_of_sequences
        self.epoch = 0
        self.infinite = infinite
        self.prompt="Translate the following Japanese sentence to English:\n\nJapanese:"

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        # Examples seen since the iterator was last (re)created; lets us detect
        # an empty dataset, which would otherwise loop forever in infinite mode.
        examples_this_pass = 0
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    break
                try:
                    cur_data=next(iterator)
                except StopIteration:
                    if self.infinite and examples_this_pass > 0:
                        # Wrap around and keep filling the current buffer.
                        iterator = iter(self.dataset)
                        self.epoch += 1
                        examples_this_pass = 0
                        logger.info(f"Dataset epoch: {self.epoch}")
                        continue
                    more_examples = False
                    break
                examples_this_pass += 1
                buffer.append(self.prompt+cur_data["src"]+"\nEnglish:"+cur_data["trg"])
                buffer_len += len(buffer[-1])
            if not buffer:
                # Nothing was collected (dataset exhausted or empty): stop rather
                # than calling the tokenizer on an empty batch.
                break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                # No explicit separator is appended between examples here.
                all_token_ids.extend(tokenized_input)
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                # Only full-length chunks are emitted; the remainder is discarded.
                if len(input_ids) == self.seq_length:
                    yield {'input_ids':torch.tensor(input_ids),'labels':torch.tensor(input_ids)}

In [9]:
# Endless stream of packed training chunks over the shuffled pairs.
train_dataset = ConstantLengthDataset(tokenizer=tokenizer, dataset=raw_datasets, infinite=True)
# Separate iterator used only for the manual inspection in the next cell.
it = iter(train_dataset)

In [10]:
# Pull one packed chunk, confirm its token count, and decode it back to text.
x = next(it)['input_ids']
print(x.shape[0])
tokenizer.decode(x)

320


"<bos>Translate the following Japanese sentence to English:\n\nJapanese:これは...\nEnglish:what the...<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:あくまで可能性を 言ったつもりで・・・\nEnglish:and thanks to stacy that dream's about to come true.<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:そのコストがとても高いという事を述べています\nEnglish:is very little, at a very high cost.<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:友喜! どうしたの? 消火器なら\nEnglish:tomoki! what happened? if you use a fire extinguisher<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:5分あげます\nEnglish:you get five minutes.<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:スコットにだ\nEnglish:it's for scott.<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:私はガジェットに注目している1人です\nEnglish:and i'm one of many people who believes that<eos><bos>Translate the following Japanese sentence to English:\n\nJapanese:シ

In [11]:
# from transformers import DataCollatorForSeq2Seq
# class TranslationDataCollator(DataCollatorForSeq2Seq):
#     def __init__(self, tokenizer, max_length=128):
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __call__(self, features):
#         inputs = []
#         labels = []
        
#         for example in features:
#             # Get the Japanese (src) and English (trg) text
#             src_text = example["src"]
#             trg_text = example["trg"]

#             # Format prompt for translation
#             prompt = f"Translate the following Japanese sentence to English:\n\nJapanese:{src_text}"
#             len_of_chat_template=64
#             # Tokenize prompt and target text
#             prompt_tokens = self.tokenizer(prompt, truncation=True, max_length=self.max_length+len_of_chat_template, add_special_tokens=False)["input_ids"]
#             trg_tokens = self.tokenizer(trg_text, truncation=True, max_length=self.max_length, add_special_tokens=False)["input_ids"]
            
#             # Concatenate tokens with BOS and EOS
#             input_ids = [self.tokenizer.bos_token_id] + prompt_tokens + self.tokenizer.encode("\nEnglish:",add_special_tokens=False) + trg_tokens + [self.tokenizer.eos_token_id]
#             label_ids = input_ids.copy()  # Autoregressive model needs labels to match input
            
#             inputs.append(input_ids)
#             labels.append(label_ids)

#         # Pad sequences in the batch
#         inputs = self.tokenizer.pad({"input_ids": inputs}, padding=True, return_tensors="pt")["input_ids"]
#         labels = self.tokenizer.pad({"input_ids": labels}, padding=True, return_tensors="pt")["input_ids"]

#         # print(self.tokenizer.batch_decode(inputs))
        
#         return {
#             "input_ids": inputs,
#             "labels": labels,
#         }

# # Create an instance of the data collator
# data_collator = TranslationDataCollator(tokenizer=tokenizer)

In [18]:
training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir="./results",
    save_steps=1000,
    save_total_limit=1,
    push_to_hub=False,
    # --- schedule ---
    # max_steps takes precedence over num_train_epochs (see the Trainer message below).
    max_steps=45000,
    num_train_epochs=1,
    # --- optimisation ---
    learning_rate=5e-5,
    lr_scheduler_type='cosine',
    warmup_steps=200,
    weight_decay=0.01,
    # --- batching ---
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    gradient_accumulation_steps=2,
    # --- data handling / logging ---
    remove_unused_columns=False,
    logging_steps=1,
    report_to='none',  # Neptune is attached explicitly through a callback instead
    # eval_strategy="epoch",
)

In [19]:
import neptune
from transformers.integrations import NeptuneCallback
# Experiment tracking: credentials were read from Kaggle secrets earlier.
neptune_callback = NeptuneCallback(api_token=neptune_api, project=neptune_project)

In [20]:
trainer = Trainer(
    args=training_args,
    model=model,
    # Iterable dataset that already yields tokenized, fixed-length input_ids/labels.
    train_dataset=train_dataset,
    # No custom collator is passed (data_collator left at the Trainer default).
    tokenizer=tokenizer,
    callbacks=[neptune_callback],
)

max_steps is given, it will override any value given in num_train_epochs


In [21]:
trainer.train()

Step,Training Loss
1,3.5379
2,3.6249
3,3.3536
4,3.4576
5,3.3625
6,3.4706
7,3.4332
8,3.4612
9,3.4402
10,3.1884


KeyboardInterrupt: 

In [22]:
# Quick qualitative check: translate one Japanese sentence with the current model.
src_text='あなたとは遊びたくない'
prompt=f"Translate the following Japanese sentence to English:\n\nJapanese:{src_text}\nEnglish:"
prompt_ids=tokenizer.encode(prompt,return_tensors='pt')
# The dataset class switches on add_eos_token for this tokenizer, which puts an
# <eos> at the end of the prompt. Remove it only when it is actually there, so
# the cell also works if the tokenizer was not modified (previously the last
# token was always sliced off).
if prompt_ids.shape[1] > 0 and prompt_ids[0, -1].item() == tokenizer.eos_token_id:
    prompt_ids=prompt_ids[:,:-1]
# Follow the model's device instead of assuming CUDA.
generated_ids=model.generate(prompt_ids.to(model.device),max_length=128)
print(tokenizer.batch_decode(generated_ids)[0])

<bos>Translate the following Japanese sentence to English:

Japanese:あなたとは遊びたくない
English:i don't want to play with you.<eos>
