In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle
import argparse
from torch.optim import AdamW

In [2]:

# Compute device: CUDA when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Hyperparameters for this run.
# NOTE(review): in the cells shown here only `device`, `max_iters`,
# `learning_rate` and `eval_iters` are actually read; the remaining values are
# never passed to `GPTLanguageModel` or `Data_preprocess`, so those modules
# presumably carry their own copies -- confirm the two stay in sync.
batch_size, block_size = 32, 128
max_iters = 10000
learning_rate = 2e-5
eval_iters = 100
n_embd, n_head, n_layer = 384, 4, 4
dropout = 0.2

In [3]:
import model
import Data_preprocess

In [10]:
from model import GPTLanguageModel

In [11]:
# Load the data through the project helper. The three results are used below as
# the vocabulary size handed to the model and the two splits handed to
# `Data_preprocess.get_batch` (`test` serves as the validation split).
# NOTE(review): the concrete types of `train` / `test` are defined in
# Data_preprocess and are not visible here.
vocab_size, train, test = Data_preprocess.loader()

In [12]:
# Build the language model for the loaded vocabulary size.
model_object = GPTLanguageModel(vocab_size)

# Move the model to the selected device. `m` is whatever `.to()` returns; for a
# torch.nn.Module that is the module itself, so `m` and `model_object` would be
# the same object -- presumably GPTLanguageModel is an nn.Module (confirm in model.py).
m = model_object.to(device)

In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean training and validation loss of ``model_object``.

    Draws ``eval_iters`` batches from each of the module-level ``train`` and
    ``test`` splits via ``Data_preprocess.get_batch``, runs ``model_object`` on
    them in evaluation mode, and averages the per-batch ``loss.item()`` values.
    Gradient tracking is disabled for the whole call by the
    ``@torch.no_grad()`` decorator, so no inner ``no_grad`` context is needed.

    The model is always switched back to training mode before control leaves
    this function, including when batch loading or the forward pass raises;
    otherwise a failed evaluation would silently leave the model in eval mode
    for every subsequent training step.

    Returns:
        dict: ``{"train": mean_train_loss, "val": mean_val_loss}``.

    Raises:
        ZeroDivisionError: if ``eval_iters`` is 0.
    """
    model_object.eval()
    try:
        train_loss = 0.0
        val_loss = 0.0
        for _ in range(eval_iters):
            # Train and validation batches are drawn alternately so the order
            # of get_batch calls is the same as before this rewrite.
            xb_train, yb_train = Data_preprocess.get_batch(train)
            _, loss = model_object(xb_train, yb_train)
            train_loss += loss.item()

            xb_val, yb_val = Data_preprocess.get_batch(test)
            _, val_loss_batch = model_object(xb_val, yb_val)
            val_loss += val_loss_batch.item()
    finally:
        # Restore training mode even on failure.
        model_object.train()
    return {"train": train_loss / eval_iters, "val": val_loss / eval_iters}

In [None]:
# Optimizer over all parameters of the model.
optimizer = AdamW(model_object.parameters(), lr=learning_rate)

# The loop variable is named `step` rather than `iter` so the built-in iter()
# is not rebound in the notebook namespace for later cells.
loss = None
for step in range(max_iters):
    # Periodic evaluation. Note that `eval_iters` is used both as the reporting
    # interval here and as the number of batches averaged inside estimate_loss().
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"Step: {step}, Train Loss: {losses['train']:.3f}, Val Loss: {losses['val']:.3f}")

    # One optimization step on a fresh training batch.
    xb, yb = Data_preprocess.get_batch(train)

    logits, loss = model_object(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the last training batch only (not an average); skipped when the loop
# never ran (max_iters == 0) and no loss exists.
if loss is not None:
    print(f"Final Loss: {loss.item()}")


# Persist the whole model object with pickle.
# NOTE(review): pickle stores classes by reference, so loading this file needs
# the same `model` module importable, and unpickling must only be done on
# trusted files. Saving `model_object.state_dict()` with `torch.save` is the
# more portable option, but it would change the file format that existing
# loaders of this checkpoint expect, so it is not changed here.
with open('trial_3_10000iter.pkl', 'wb') as f:
    pickle.dump(model_object, f)
print('Model saved as trial_3_10000iter.pkl')

step: 0, train loss: 1.194, val loss: 1.439
step: 100, train loss: 1.194, val loss: 1.438
step: 200, train loss: 1.190, val loss: 1.434
step: 300, train loss: 1.188, val loss: 1.434
step: 400, train loss: 1.187, val loss: 1.434
step: 500, train loss: 1.182, val loss: 1.441
step: 600, train loss: 1.183, val loss: 1.438
step: 700, train loss: 1.182, val loss: 1.433
step: 800, train loss: 1.181, val loss: 1.437
step: 900, train loss: 1.178, val loss: 1.435
step: 1000, train loss: 1.179, val loss: 1.434
step: 1100, train loss: 1.177, val loss: 1.432
step: 1200, train loss: 1.175, val loss: 1.437
step: 1300, train loss: 1.177, val loss: 1.431
step: 1400, train loss: 1.172, val loss: 1.428
step: 1500, train loss: 1.167, val loss: 1.434
step: 1600, train loss: 1.168, val loss: 1.438
step: 1700, train loss: 1.168, val loss: 1.442
step: 1800, train loss: 1.171, val loss: 1.434
step: 1900, train loss: 1.159, val loss: 1.434
step: 2000, train loss: 1.165, val loss: 1.433
step: 2100, train loss: 1