In [None]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchinfo import summary
from tqdm import tqdm
import tiktoken
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils import clip_grad_norm_
import warnings

# Global notebook configuration: every name below is read by the model classes
# and by the training / generation cells further down.

# Silences *all* Python warnings for the rest of the kernel session.
warnings.filterwarnings("ignore")
# GPT-2 byte-pair tokenizer; used by GPT.generate to decode sampled token ids.
encoding = tiktoken.get_encoding("gpt2")
# Prefer the GPU when one is visible, otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed torch's global RNG (weight init, random_split, dropout, sampling).
torch.manual_seed(42)

# HyperParameters
vocab_length = 50304  # rows of the token embedding / width of the output logits
                      # (presumably GPT-2's vocabulary padded upwards — TODO confirm)
sequence_length = 128  # context window; also the size of the positional-embedding table
embedding_dimension = 256  # model width
batch_size = 32  # batch size of both DataLoaders
current_epoch = 32  # epoch to resume from: selects saved_weight{current_epoch}.pth and the loop start
epochs = 100  # last epoch of the training loop and T_max of the cosine LR schedule
initial_lr = 3e-4  # AdamW learning rate at the start of the schedule
min_lr = 1e-4  # eta_min of the cosine LR schedule
num_head = 8  # attention heads per block; each head is embedding_dimension // num_head wide
num_layer = 6  # number of stacked transformer blocks
dropout = 0.2  # dropout probability shared by attention, projection and MLP
mlp_factor = 4  # hidden width of the MLP as a multiple of embedding_dimension

class SingleHead(nn.Module):
    """A single causal self-attention head.

    Projects the input to query/key/value vectors of width ``hid_dim`` and
    combines them with ``F.scaled_dot_product_attention`` using a causal mask.

    Args:
        hid_dim: Output width of this head.
    """

    def __init__(self, hid_dim):
        super().__init__()
        self.hid_dim = hid_dim
        self.Wquery = nn.Linear(embedding_dimension, hid_dim, bias=False)
        self.Wkey = nn.Linear(embedding_dimension, hid_dim, bias=False)
        self.Wvalue = nn.Linear(embedding_dimension, hid_dim, bias=False)
        # Not used in forward (attention dropout is passed as dropout_p below);
        # kept so the module's attributes stay unchanged.
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Apply causal attention.

        Args:
            x: Tensor whose last dimension is ``embedding_dimension``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``hid_dim``.
        """
        Q = self.Wquery(x)
        K = self.Wkey(x)
        V = self.Wvalue(x)

        # The functional API drops attention weights whenever dropout_p > 0;
        # it does not look at module.training. Gate it here so that
        # model.eval() really disables dropout for validation and generation.
        attn_dropout = dropout if self.training else 0.0

        out = F.scaled_dot_product_attention(Q, K, V, dropout_p=attn_dropout, is_causal=True)

        return out

class MultiHead(nn.Module):
    """Multi-head self-attention built from independent ``SingleHead`` modules.

    ``num_head`` heads of width ``embedding_dimension // num_head`` run on the
    same input; their outputs are concatenated on the last axis, passed
    through a linear projection and then through dropout.
    """

    def __init__(self):
        super().__init__()
        head_width = embedding_dimension // num_head
        head_modules = [SingleHead(head_width) for _ in range(num_head)]
        self.heads = nn.ModuleList(head_modules)
        self.project = nn.Linear(embedding_dimension, embedding_dimension)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x`` and merge the results."""
        per_head = [attend(x) for attend in self.heads]
        merged = torch.cat(per_head, dim=-1)
        return self.drop(self.project(merged))

class FeedForward(nn.Module):
    """Position-wise MLP: expand by ``mlp_factor``, GELU, project back, dropout."""

    def __init__(self):
        super().__init__()
        hidden_width = mlp_factor * embedding_dimension
        # Layer order fixes the ``mlp.<index>`` names used in saved checkpoints.
        layers = [
            nn.Linear(embedding_dimension, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, embedding_dimension),
            nn.Dropout(dropout),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x`` (last dimension ``embedding_dimension``)."""
        transformed = self.mlp(x)
        return transformed

class Block(nn.Module):
    """Transformer block with LayerNorm applied before each sub-layer.

    Both the attention sub-layer and the MLP sub-layer are wrapped in a
    residual connection.
    """

    def __init__(self):
        super().__init__()
        self.mha = MultiHead()
        self.ff = FeedForward()
        self.norm1 = nn.LayerNorm(embedding_dimension)
        self.norm2 = nn.LayerNorm(embedding_dimension)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        attended = self.mha(self.norm1(x))
        after_attention = x + attended
        transformed = self.ff(self.norm2(after_attention))
        return after_attention + transformed

class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token and learned positional embeddings are summed, passed through
    ``num_layer`` ``Block`` modules and a final LayerNorm, then projected to
    ``vocab_length`` logits. The output projection shares its weight matrix
    with the token embedding.
    """

    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(vocab_length, embedding_dimension)
        self.positional_embedding = nn.Embedding(sequence_length, embedding_dimension)
        self.blocks = nn.Sequential(*[Block() for _ in range(num_layer)])
        self.norm = nn.LayerNorm(embedding_dimension)
        self.fc = nn.Linear(embedding_dimension, vocab_length)
        # Weight tying: the output projection reuses the token-embedding matrix.
        self.fc.weight = self.embedding.weight

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: Integer tensor of token ids with shape ``(B, T)``; ``T`` must
                not exceed ``sequence_length`` (size of the positional table).

        Returns:
            Tensor of shape ``(B * T, vocab_length)`` — batch and time are
            flattened so the result can be fed straight to cross-entropy.
        """
        _, T = x.shape
        # Build positions on the input's device so the model does not depend
        # on the notebook-level ``device`` global.
        positions = torch.arange(T, device=x.device)
        hidden = self.embedding(x) + self.positional_embedding(positions)

        hidden = self.blocks(hidden)
        hidden = self.norm(hidden)
        logits = self.fc(hidden)
        return logits.view(-1, vocab_length)

    def generate(self, x, temp=1.0, max_length=2000):
        """Sample tokens autoregressively and print them as they are produced.

        Args:
            x: Integer tensor of prompt token ids with shape ``(1, T)``. Only a
                single prompt is supported (the sampled token is read with
                ``.item()``).
            temp: Softmax temperature; must be non-zero.
            max_length: Maximum number of tokens to sample.

        Returns:
            None. Decoded tokens are written to stdout. Sampling stops early
            when token id 50256 is drawn.
        """
        stop_token = 50256  # ids above this are never printed (see below)
        x = x.to(self.embedding.weight.device)
        with torch.no_grad():
            for _ in range(max_length):
                # Condition on at most the last `sequence_length` tokens.
                idx_cnt = x[:, -sequence_length:]
                logits = self(idx_cnt)
                logits = logits.reshape(1, idx_cnt.shape[1], -1)
                prob = F.softmax(logits[:, -1, :] / temp, dim=1)
                next_token = torch.multinomial(prob, num_samples=1)
                # Grow the running sequence itself. Appending only to the
                # cropped window would be lost on the next iteration and the
                # model would keep sampling from the unchanged prompt.
                x = torch.cat((x, next_token), dim=1)
                token_id = next_token.item()
                if token_id == stop_token:
                    break
                if token_id < stop_token:
                    print(encoding.decode([token_id]), end='')


In [None]:
# Pre-tokenized corpus: passed to SequenceDataset below as its token sequence.
with open('tokenswiki.json', 'r') as file:
    token = json.load(file)

# Loss history from earlier runs; the training loop appends the train and
# validation loss of every epoch and rewrites this file. A first run has no
# history yet, so start from an empty list instead of failing.
if os.path.exists('loss.json'):
    with open('loss.json', 'r') as file:
        LOSS = json.load(file)
else:
    LOSS = []

class SequenceDataset(Dataset):
    """Sliding-window next-token dataset over one long token sequence.

    Windows are ``seq_length`` tokens long and start every
    ``seq_length // 2`` tokens, so consecutive windows overlap by half. Each
    item is an ``(x, y)`` pair of length ``seq_length - 1`` where ``y`` is
    ``x`` shifted one token to the right.

    Args:
        data: Sequence of integer token ids.
        seq_length: Window length in tokens.
    """

    def __init__(self, data, seq_length):
        self.data = torch.tensor(data, dtype=torch.long)
        self.seq_length = seq_length
        # Distance between window starts; at least 1 so the count below is defined.
        self.stride = max(1, seq_length // 2)
        total = len(self.data)
        if total < seq_length:
            # Not even one full window.
            self.num_samples = 0
        else:
            # Count windows at the stride actually used by __getitem__, plus a
            # final end-aligned window for any remainder (ceil division).
            # Counting len // seq_length here would stop indexing roughly
            # halfway through the data because windows advance by half a
            # window each.
            self.num_samples = -(-(total - seq_length) // self.stride) + 1

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair for window ``idx``."""
        start = idx * self.stride  # Overlapping sequences
        end = start + self.seq_length
        if end > len(self.data):  # Last window: align it to the end of the data
            end = len(self.data)
            start = end - self.seq_length if end - self.seq_length >= 0 else 0
        x = self.data[start:end - 1]
        y = self.data[start + 1:end]
        return x, y

# Build the windowed dataset and split it 90% / 10% into train and validation.
# `token`, `sequence_length` and `batch_size` come from the cells above.
dataset = SequenceDataset(token, sequence_length)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
# No generator is passed, so the split depends on the global torch RNG state
# (seeded with torch.manual_seed in the first cell).
# NOTE(review): the windows overlap (see SequenceDataset.__getitem__), so a
# random split can put overlapping windows in both subsets and make the
# validation loss look better than it is — consider splitting the token
# stream before windowing.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
# drop_last=True discards the final incomplete batch of each loader.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

if __name__ == '__main__':
    # Training script: resumes from saved_weight{current_epoch}.pth when that
    # file exists, trains up to `epochs`, and after every epoch writes a
    # checkpoint plus the accumulated loss history (loss.json).
    model = GPT().to(device)

    print(summary(model))

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=initial_lr)

    # Cosine decay from initial_lr towards min_lr over `epochs` epochs.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=min_lr)

    weight_path = f'saved_weight{current_epoch}.pth'
    if os.path.exists(weight_path):
        # map_location lets a checkpoint written on one device be resumed on another.
        model.load_state_dict(torch.load(weight_path, map_location=device))
        tqdm.write("Loaded pre-trained weights!")

    # The loop below starts at `current_epoch`, so advance the schedule to the
    # same position; otherwise every resume would restart at initial_lr and
    # the schedule would never reach its tail. (Only model weights are
    # checkpointed, so the AdamW moment estimates still start fresh.)
    for _ in range(current_epoch):
        scheduler.step()

    # Mixed precision training setup (CUDA only)
    scaler = torch.cuda.amp.GradScaler() if device.type == 'cuda' else None

    def compute_loss(inputs, targets):
        """Cross-entropy between the model's flattened logits and the targets."""
        logits = model(inputs).view(-1, vocab_length)
        return criterion(logits, targets.reshape(-1))

    # Training loop
    for epoch in range(current_epoch, epochs):
        # Training phase
        model.train()
        total_train_loss = 0.0
        train_steps = 0
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]", leave=False)

        for batch_idx, (x, y) in enumerate(train_bar):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            if scaler is not None:
                with torch.cuda.amp.autocast():
                    loss = compute_loss(x, y)
                scaler.scale(loss).backward()
                # Gradients are still multiplied by the loss scale here;
                # unscale them first so max_norm applies to the true gradient norm.
                scaler.unscale_(optimizer)
                clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                loss = compute_loss(x, y)
                loss.backward()
                clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            total_train_loss += loss.item()
            train_steps += 1
            avg_train_loss = total_train_loss / train_steps
            train_bar.set_postfix({"Train Loss": f"{avg_train_loss}"})

        # Validation phase
        model.eval()
        total_val_loss = 0.0
        val_steps = 0
        val_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]", leave=False)
        with torch.no_grad():
            for x, y in val_bar:
                x, y = x.to(device), y.to(device)
                loss = compute_loss(x, y)
                total_val_loss += loss.item()
                val_steps += 1
                avg_val_loss = total_val_loss / val_steps
                val_bar.set_postfix({"Val Loss": f"{avg_val_loss}"})

        # Display epoch results using tqdm.write. A loader can yield no batches
        # (drop_last=True on a split smaller than one batch); report NaN
        # instead of dividing by zero.
        avg_train_loss = total_train_loss / train_steps if train_steps else float('nan')
        avg_val_loss = total_val_loss / val_steps if val_steps else float('nan')
        tqdm.write(f"Epoch {epoch+1} - Train Loss: {avg_train_loss}, Val Loss: {avg_val_loss}")
        # LOSS is a flat list: train loss then validation loss for each epoch.
        LOSS.append(avg_train_loss)
        LOSS.append(avg_val_loss)
        # Update learning rate and display
        scheduler.step()
        current_lr = optimizer.param_groups[0]['lr']
        tqdm.write(f"Epoch {epoch+1} complete. Current LR: {current_lr}")

        # Save checkpoint (model weights only) and the loss history
        checkpoint_path = f'saved_weight{epoch+1}.pth'
        torch.save(model.state_dict(), checkpoint_path)
        with open('loss.json', 'w') as file:
            json.dump(LOSS, file)
        tqdm.write(f"Checkpoint saved: {checkpoint_path}")

    tqdm.write("Training Complete")

In [None]:
# Build a fresh model in eval mode and restore trained weights if available.
model = GPT().to(device)
model.eval()

# NOTE(review): the training cell writes saved_weight{epoch}.pth; confirm that
# this un-numbered file name is the checkpoint that should be loaded.
weight_path = 'saved_weight.pth'
if not os.path.exists(weight_path):
    print("Warning: No pre-trained weights found!")
else:
    state_dict = torch.load(weight_path, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)

# Sample a continuation of a short prompt; generate() prints tokens as it goes.
encoding = tiktoken.get_encoding("gpt2")
prompt = 'Owen Mercer first'
print(prompt, end='')
prompt = encoding.encode(prompt)
# Optional chat-style markers (disabled):
#prompt.insert(0,5641) #<user-start>
#prompt.append(49476) #<user-end>
prompt_batch = torch.tensor([prompt], dtype=torch.long, device=device)
model.generate(prompt_batch, temp=1)