In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast
from torch.utils.checkpoint import checkpoint_sequential
from tqdm import tqdm
import time
import os
import math
import sentencepiece as spm
import re
from datasets import load_dataset
import gc

In [2]:
# Step 1: Define hyperparameters
batch_size = 8        # Batch size for training
block_size = 512      # Context length
max_iters_pretrain = 10000  # Reduced for Kaggle P100
max_iters_finetune = 10000   # Iterations for fine-tuning
eval_interval = 500   # Evaluate every 500 steps
learning_rate_pretrain = 3e-4  # Learning rate for pre-training
learning_rate_finetune = 3e-5  # Learning rate for fine-tuning
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200      # Evaluation iterations
n_embd = 768          # Embedding dimension
n_head = 12           # Attention heads
n_layer = 12          # Transformer layers
dropout = 0.2         # Dropout
gradient_accumulation_steps = 2  # For memory efficiency
checkpoint_interval = 5000       # Save checkpoint every 5000 steps

In [3]:
# Cell 3: Load and clean Vietnamese-Book-Corpus
print("Step 1: Loading and cleaning Vietnamese-Book-Corpus...")
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

def split_long_sentences(text, max_length=15000):
    sentences = []
    current = ""
    for char in text:
        current += char
        if len(current) >= max_length and char in '.!?':
            sentences.append(current.strip())
            current = ""
    if current:
        sentences.append(current.strip())
    return sentences

start_time = time.time()
ds = load_dataset("tmnam20/Vietnamese-Book-Corpus")
total_chars = sum(len(item) for item in ds['train']['text'])
print(f"Total characters in dataset: {total_chars} (~{total_chars / (1024 * 1024):.2f} MB)")
print(f"Dataset size: {len(ds['train'])} examples")

with open('book_corpus.txt', 'w', encoding='utf-8') as f:
    for item in ds['train']['text']:
        cleaned = clean_text(item)
        if len(cleaned) > 0:
            for sentence in split_long_sentences(cleaned, max_length=15000):
                if len(sentence) > 0:
                    f.write(sentence + '\n')

file_size = os.path.getsize('book_corpus.txt') / (1024 * 1024)
print(f"Corpus file size: {file_size:.2f} MB")
print(f"Corpus loaded and cleaned in {time.time() - start_time:.2f} seconds")

# Train SentencePiece tokenizer
print("Step 2: Training SentencePiece tokenizer...")
spm.SentencePieceTrainer.train(
    input='book_corpus.txt',
    model_prefix='tokenizer',
    vocab_size=16000,
    max_sentence_length=15000
)
sp = spm.SentencePieceProcessor(model_file='tokenizer.model')
encode = lambda s: sp.encode(s, out_type=int)
decode = lambda s: sp.decode(s)
vocab_size = sp.get_piece_size()
print(f"Tokenizer trained with vocab size: {vocab_size}")

Step 1: Loading and cleaning Vietnamese-Book-Corpus...


README.md:   0%|          | 0.00/460 [00:00<?, ?B/s]

(…)-00000-of-00008-434a3967f6c4c0f4.parquet:   0%|          | 0.00/647M [00:00<?, ?B/s]

(…)-00001-of-00008-6fed45ff9717b1da.parquet:   0%|          | 0.00/302M [00:00<?, ?B/s]

(…)-00002-of-00008-14351aa8b7f792d4.parquet:   0%|          | 0.00/345M [00:00<?, ?B/s]

(…)-00003-of-00008-fdb56c534d52ac56.parquet:   0%|          | 0.00/127M [00:00<?, ?B/s]

(…)-00004-of-00008-be47438765872609.parquet:   0%|          | 0.00/116M [00:00<?, ?B/s]

(…)-00005-of-00008-a357bbaf12eaeec8.parquet:   0%|          | 0.00/123M [00:00<?, ?B/s]

(…)-00006-of-00008-f4e317464972c061.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

(…)-00007-of-00008-399c5b14967345df.parquet:   0%|          | 0.00/134M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

Total characters in dataset: 2871090627 (~2738.09 MB)
Dataset size: 16407 examples
Corpus file size: 3544.48 MB
Corpus loaded and cleaned in 753.17 seconds
Step 2: Training SentencePiece tokenizer...
Tokenizer trained with vocab size: 16000


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: book_corpus.txt
  input_format: 
  model_prefix: tokenizer
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 15000
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
 

In [4]:
# Cell 4: Custom Dataset and DataLoaders with debug checks
class TextDataset(Dataset):
    def __init__(self, file_path, block_size, chunk_size=1024*1024):
        self.file_path = file_path
        self.block_size = block_size
        self.chunk_size = chunk_size
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Dataset file not found: {file_path}")
        self.file_size = os.path.getsize(file_path)
        self.num_chunks = math.ceil(self.file_size / chunk_size)

    def __len__(self):
        return self.file_size // self.block_size

    def __getitem__(self, idx):
        offset = idx * self.block_size
        try:
            with open(self.file_path, 'r', encoding='utf-8', errors='replace') as f:
                f.seek(offset)
                text = f.read(self.block_size + 1)
                if len(text) < self.block_size + 1:
                    text = text + '\0' * (self.block_size + 1 - len(text))
                
                # Tokenize input and target
                x_tokens = encode(text[:self.block_size])
                y_tokens = encode(text[1:self.block_size + 1])
                
                # Pad or truncate to fixed length (block_size)
                x_tokens = x_tokens[:self.block_size] + [0] * (self.block_size - len(x_tokens)) if len(x_tokens) < self.block_size else x_tokens[:self.block_size]
                y_tokens = y_tokens[:self.block_size] + [0] * (self.block_size - len(y_tokens)) if len(y_tokens) < self.block_size else y_tokens[:self.block_size]
                
                # Convert to tensors
                x = torch.tensor(x_tokens, dtype=torch.long)
                y = torch.tensor(y_tokens, dtype=torch.long)
                
                # Debug print to verify lengths
                if len(x) != self.block_size or len(y) != self.block_size:
                    print(f"Warning: x length={len(x)}, y length={len(y)} at idx={idx} in file {self.file_path}")
                
                return x.to(device), y.to(device)
        except Exception as e:
            print(f"Error reading file {self.file_path} at offset {offset}: {e}")
            raise

# Create DataLoaders with debug prints
corpus_path = 'book_corpus.txt'
poems_path = '/kaggle/input/poems-dataset/poems_dataset.txt'

print(f"Checking corpus file: {corpus_path}")
if not os.path.exists(corpus_path):
    raise FileNotFoundError(f"Corpus file not found: {corpus_path}")

print(f"Checking poems file: {poems_path}")
if not os.path.exists(poems_path):
    raise FileNotFoundError(f"Poems file not found: {poems_path}")

train_dataset = TextDataset(corpus_path, block_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
print(f"train_loader type: {type(train_loader)}")

poem_dataset = TextDataset(poems_path, block_size)
poem_train_loader = DataLoader(poem_dataset, batch_size=batch_size, shuffle=True)
poem_val_loader = DataLoader(poem_dataset, batch_size=batch_size, shuffle=False)  # Separate validation loader
print(f"poem_train_loader type: {type(poem_train_loader)}")
print(f"poem_val_loader type: {type(poem_val_loader)}")

Checking corpus file: book_corpus.txt
Checking poems file: /kaggle/input/poems-dataset/poems_dataset.txt
train_loader type: <class 'torch.utils.data.dataloader.DataLoader'>
poem_train_loader type: <class 'torch.utils.data.dataloader.DataLoader'>
poem_val_loader type: <class 'torch.utils.data.dataloader.DataLoader'>


In [5]:
# Cell: Clear lingering iter variable
try:
    del iter
    print("Cleared lingering 'iter' variable")
except NameError:
    print("'iter' variable not found, proceeding normally")

'iter' variable not found, proceeding normally


In [6]:
# Cell 5: Model definition with gradient checkpointing
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02 / math.sqrt(2 * n_layer))
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = checkpoint_sequential(self.blocks, segments=2, input=x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Cell 6: Loss estimation function
@torch.no_grad()
def estimate_loss(train_loader, val_loader):
    out = {}
    model.eval()
    for split, loader in [('train', train_loader), ('val', val_loader)]:
        losses = torch.zeros(eval_iters)
        file_path = loader.dataset.file_path
        print(f"Processing {split} split with loader type: {type(loader)}, file: {file_path}")
        iterator = iter(loader)
        for k in range(eval_iters):
            try:
                X, Y = next(iterator)
            except StopIteration:
                iterator = iter(loader)
                X, Y = next(iterator)
            with autocast():
                logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [7]:
# Cell 7: Initialize model and optimizer
print("Step 3: Initializing model for pre-training...")
model = GPTLanguageModel()
model = model.to(device)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f} M")
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate_pretrain, weight_decay=0.01)

# Check GPU memory
print("Step 4: Checking GPU memory...")
if device == 'cuda':
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e6:.2f} MB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1e6:.2f} MB")

# # Cell 8: Pre-training loop with updated loader calls
# print("Step 5: Starting pre-training...")
# start_time = time.time()
# best_val_loss = float('inf')
# best_model_path = 'pretrained_model.pt'
# scaler = torch.amp.GradScaler('cuda')

# for iteration in tqdm(range(max_iters_pretrain), desc="Pre-training"):
#     if iteration % eval_interval == 0 or iteration == max_iters_pretrain - 1:
#         print(f"Calling estimate_loss with train_loader: {type(train_loader)}, val_loader: {type(poem_val_loader)}")
#         losses = estimate_loss(train_loader, poem_val_loader)  # Use poem_val_loader
#         print(f"step {iteration}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
#         if losses['val'] < best_val_loss:
#             best_val_loss = losses['val']
#             torch.save({
#                 'model_state_dict': model.state_dict(),
#                 'optimizer_state_dict': optimizer.state_dict(),
#                 'val_loss': best_val_loss,
#                 'iteration': iteration
#             }, best_model_path)
#             print(f"New best model saved to {best_model_path} with val_loss {best_val_loss:.4f}")

#     optimizer.zero_grad(set_to_none=True)
#     loss_accum = 0
#     iterator = iter(train_loader)
#     for _ in range(gradient_accumulation_steps):
#         try:
#             xb, yb = next(iterator)
#         except StopIteration:
#             iterator = iter(train_loader)
#             xb, yb = next(iterator)
#         with autocast():
#             logits, loss = model(xb, yb)
#             loss = loss / gradient_accumulation_steps
#         loss_accum += loss.item()
#         scaler.scale(loss).backward()
#     scaler.unscale_(optimizer)
#     torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#     scaler.step(optimizer)
#     scaler.update()
#     torch.cuda.empty_cache()

#     if iteration % checkpoint_interval == 0 and iteration > 0:
#         checkpoint_path = f'pretrained_model_iter_{iteration}.pt'
#         torch.save({
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             'val_loss': best_val_loss,
#             'iteration': iteration
#         }, checkpoint_path)
#         print(f"Checkpoint saved to {checkpoint_path}")

# print(f"Pre-training completed in {time.time() - start_time:.2f} seconds")
# gc.collect()

Step 3: Initializing model for pre-training...
Model parameters: 110.01 M
Step 4: Checking GPU memory...
GPU memory allocated: 591.57 MB
GPU memory reserved: 637.53 MB


In [8]:
# Cell 9: Fine-tuning loop
print("Step 6: Loading pre-trained model for fine-tuning...")
try:
    checkpoint = torch.load('/kaggle/input/gpt-1-pretraining-finetuning/finetuned_model.pt', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print("Pre-trained model loaded successfully")
except FileNotFoundError:
    print("Pre-trained model not found at 'pretrained_model.pt'. Starting fine-tuning with current model.")
except Exception as e:
    print(f"Error loading pre-trained model: {e}")
    raise

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate_finetune, weight_decay=0.01)

print("Step 7: Starting fine-tuning...")
start_time = time.time()
best_val_loss = float('inf')
best_finetune_path = 'finetuned_model.pt'
scaler = torch.amp.GradScaler('cuda')  # Updated to fix deprecation warning

for iteration in tqdm(range(max_iters_finetune), desc="Fine-tuning"):
    if iteration % eval_interval == 0 or iteration == max_iters_finetune - 1:
        print(f"Calling estimate_loss with train_loader: {type(poem_train_loader)}, val_loader: {type(poem_val_loader)}")
        losses = estimate_loss(poem_train_loader, poem_val_loader)  # Use poem_train_loader and poem_val_loader
        print(f"step {iteration}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': best_val_loss,
                'iteration': iteration
            }, best_finetune_path)
            print(f"New best model saved to {best_finetune_path} with val loss {best_val_loss:.4f}")

    optimizer.zero_grad(set_to_none=True)
    loss_accum = 0
    iterator = iter(poem_train_loader)  # Uses built-in iter function
    for _ in range(gradient_accumulation_steps):
        try:
            xb, yb = next(iterator)
        except StopIteration:
            iterator = iter(poem_train_loader)
            xb, yb = next(iterator)
        with autocast():
            logits, loss = model(xb, yb)
        loss = loss / gradient_accumulation_steps
        loss_accum += loss.item()
        scaler.scale(loss).backward()
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)
    scaler.update()
    torch.cuda.empty_cache()

print(f"Fine-tuning completed in {time.time() - start_time:.2f} seconds")
gc.collect()

Step 6: Loading pre-trained model for fine-tuning...
Pre-trained model loaded successfully
Step 7: Starting fine-tuning...


Fine-tuning:   0%|          | 0/10000 [00:00<?, ?it/s]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt


  with autocast():


Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 0: train loss 0.5496, val loss 0.5299
New best model saved to finetuned_model.pt with val loss 0.5299


  with autocast():
Fine-tuning:   5%|▌         | 500/10000 [14:30<4:08:11,  1.57s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 500: train loss 0.5277, val loss 0.5265
New best model saved to finetuned_model.pt with val loss 0.5265


Fine-tuning:  10%|█         | 1000/10000 [28:52<3:55:40,  1.57s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 1000: train loss 0.5473, val loss 0.5253
New best model saved to finetuned_model.pt with val loss 0.5253


Fine-tuning:  15%|█▌        | 1500/10000 [43:10<3:39:44,  1.55s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 1500: train loss 0.5166, val loss 0.5214
New best model saved to finetuned_model.pt with val loss 0.5214


Fine-tuning:  20%|██        | 2000/10000 [57:18<3:24:05,  1.53s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 2000: train loss 0.4851, val loss 0.5228


Fine-tuning:  25%|██▌       | 2500/10000 [1:11:29<3:12:32,  1.54s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 2500: train loss 0.4873, val loss 0.5154
New best model saved to finetuned_model.pt with val loss 0.5154


Fine-tuning:  30%|███       | 3000/10000 [1:25:43<3:00:35,  1.55s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 3000: train loss 0.5176, val loss 0.5129
New best model saved to finetuned_model.pt with val loss 0.5129


Fine-tuning:  35%|███▌      | 3500/10000 [1:39:55<2:46:55,  1.54s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 3500: train loss 0.5030, val loss 0.5126
New best model saved to finetuned_model.pt with val loss 0.5126


Fine-tuning:  40%|████      | 4000/10000 [1:54:09<2:33:52,  1.54s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 4000: train loss 0.5255, val loss 0.5156


Fine-tuning:  45%|████▌     | 4500/10000 [2:08:18<2:21:15,  1.54s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 4500: train loss 0.4894, val loss 0.5100
New best model saved to finetuned_model.pt with val loss 0.5100


Fine-tuning:  50%|█████     | 5000/10000 [2:22:30<2:08:21,  1.54s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 5000: train loss 0.4922, val loss 0.5122


Fine-tuning:  55%|█████▌    | 5500/10000 [2:36:40<1:55:27,  1.54s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 5500: train loss 0.4920, val loss 0.5048
New best model saved to finetuned_model.pt with val loss 0.5048


Fine-tuning:  60%|██████    | 6000/10000 [2:50:53<1:42:50,  1.54s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 6000: train loss 0.4767, val loss 0.5030
New best model saved to finetuned_model.pt with val loss 0.5030


Fine-tuning:  65%|██████▌   | 6500/10000 [3:05:06<1:29:49,  1.54s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 6500: train loss 0.4738, val loss 0.5008
New best model saved to finetuned_model.pt with val loss 0.5008


Fine-tuning:  70%|███████   | 7000/10000 [3:19:15<1:16:31,  1.53s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 7000: train loss 0.5041, val loss 0.4988
New best model saved to finetuned_model.pt with val loss 0.4988


Fine-tuning:  75%|███████▌  | 7500/10000 [3:33:29<1:04:23,  1.55s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 7500: train loss 0.5163, val loss 0.4989


Fine-tuning:  80%|████████  | 8000/10000 [3:47:40<51:31,  1.55s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 8000: train loss 0.5076, val loss 0.4972
New best model saved to finetuned_model.pt with val loss 0.4972


Fine-tuning:  85%|████████▌ | 8500/10000 [4:01:50<38:16,  1.53s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 8500: train loss 0.5004, val loss 0.4947
New best model saved to finetuned_model.pt with val loss 0.4947


Fine-tuning:  90%|█████████ | 9000/10000 [4:16:03<25:40,  1.54s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 9000: train loss 0.5096, val loss 0.4933
New best model saved to finetuned_model.pt with val loss 0.4933


Fine-tuning:  95%|█████████▌| 9500/10000 [4:30:30<12:52,  1.55s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 9500: train loss 0.4946, val loss 0.4912
New best model saved to finetuned_model.pt with val loss 0.4912


Fine-tuning: 100%|█████████▉| 9999/10000 [4:44:42<00:01,  1.54s/it]

Calling estimate_loss with train_loader: <class 'torch.utils.data.dataloader.DataLoader'>, val_loader: <class 'torch.utils.data.dataloader.DataLoader'>
Processing train split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
Processing val split with loader type: <class 'torch.utils.data.dataloader.DataLoader'>, file: /kaggle/input/poems-dataset/poems_dataset.txt
step 9999: train loss 0.4866, val loss 0.4923


Fine-tuning: 100%|██████████| 10000/10000 [4:46:03<00:00,  1.72s/it]

Fine-tuning completed in 17163.58 seconds





2391