# setup

In [2]:
!pip install datasets tqdm

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/547.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB

# data

In [5]:
from datasets import load_dataset
# Open the training split in streaming mode (streaming=True), so examples are
# fetched while iterating instead of being materialized up front.
# NOTE(review): the recorded run of this notebook ends with
# `FileNotFoundError: https://the-eye.eu/public/AI/pile/train/00.jsonl.zst`, i.e. the
# files behind this dataset id could not be fetched at iteration time. Confirm the
# source is still hosted (or point this at an available copy) before re-running.
dataset = load_dataset('EleutherAI/pile', split='train', streaming=True)

In [15]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 ships without a padding token, so the padding='max_length' call below would
# raise "Asking to pad but the tokenizer does not have a padding token". Reusing the
# end-of-text token is the standard workaround; padded positions stay identifiable
# through the attention_mask the tokenizer returns.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples, max_length=128):
    """Tokenize the 'text' field into fixed-length token id sequences.

    Args:
        examples: A mapping with a 'text' entry (one example, or a batch of
            examples when used with a batched map).
        max_length: Exact output length; longer texts are truncated and shorter
            ones are padded with the tokenizer's pad token. Defaults to 128.

    Returns:
        The tokenizer output for the text (includes 'input_ids' and
        'attention_mask').
    """
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )


# Tokenization is applied lazily as the streamed dataset is iterated.
tokenized_datasets = dataset.map(tokenize_function)


In [16]:
import torch
from torch.utils.data import DataLoader, IterableDataset

class StreamingDataset(IterableDataset):
    """Expose an iterable of examples as a torch ``IterableDataset``.

    Dict examples have their integer-sequence fields (e.g. token ids, attention
    masks) converted to ``torch.long`` tensors. Without this, the DataLoader's
    default collate function treats each Python list as a sequence to transpose
    and produces a *list of per-position tensors* instead of one
    ``(batch, seq_len)`` tensor, which breaks ``batch['input_ids'].cuda()``.

    All other fields, and non-dict examples, are passed through unchanged.

    Args:
        dataset: Any iterable yielding examples.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    @staticmethod
    def _tensorize(value):
        """Return ``value`` as a long tensor if it is a non-empty list/tuple of ints."""
        if (
            isinstance(value, (list, tuple))
            and len(value) > 0
            and all(isinstance(v, int) and not isinstance(v, bool) for v in value)
        ):
            return torch.tensor(value, dtype=torch.long)
        return value

    def __iter__(self):
        for item in self.dataset:
            if isinstance(item, dict):
                yield {key: self._tensorize(value) for key, value in item.items()}
            else:
                # Not a field mapping: nothing to convert, keep the original behavior.
                yield item

# Wrap the tokenized stream for PyTorch and serve it in batches of eight examples.
streaming_dataset = StreamingDataset(dataset=tokenized_datasets)
train_dataloader = DataLoader(dataset=streaming_dataset, batch_size=8)


# model

In [17]:
import torch
from torch import nn

class SimpleTransformer(nn.Module):
    """Small language model: embedding -> Transformer encoder stack -> vocab logits.

    The encoder layers are built with ``batch_first=True`` so that inputs shaped
    ``(batch, seq_len)`` are attended along the sequence axis. (With the PyTorch
    default of ``batch_first=False`` the same input would be read as
    ``(seq_len, batch)`` and attention would mix unrelated examples in a batch.)

    Note: no positional encoding is applied, so the only ordering signal the
    model receives comes from the causal mask.

    Args:
        d_model: Embedding / hidden size. Must be divisible by ``nhead``.
        d_ff: Width of the feed-forward sublayer.
        vocab_size: Number of token ids (embedding rows and output logits).
        nhead: Number of attention heads. Defaults to 8.
        num_layers: Number of stacked encoder layers. Defaults to 1.
    """

    def __init__(self, d_model, d_ff, vocab_size, nhead=8, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_ff,
            activation='relu',
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(self.transformer_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids, causal=True):
        """Compute per-position vocabulary logits.

        Args:
            input_ids: Long tensor of token ids, shape ``(batch, seq_len)``.
            causal: When True (default), position ``t`` may only attend to
                positions ``<= t``, as required for next-token prediction.
                Pass False for unrestricted (bidirectional) attention.

        Returns:
            Float tensor of logits, shape ``(batch, seq_len, vocab_size)``.
        """
        x = self.embedding(input_ids)

        attn_mask = None
        if causal:
            seq_len = input_ids.size(1)
            # Boolean mask: True marks a (query, key) pair that must NOT be attended,
            # i.e. every key strictly to the right of the query.
            attn_mask = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=input_ids.device),
                diagonal=1,
            )

        x = self.transformer(x, mask=attn_mask)
        return self.fc(x)

# Model parameters
d_model = 128
d_ff = 512
vocab_size = 50257  # Size of tokenizer vocabulary (e.g., GPT-2 tokenizer size)

# Prefer the GPU when one is available, but fall back to the CPU instead of
# crashing on machines without CUDA (an unconditional .cuda() raises there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleTransformer(d_model=d_model, d_ff=d_ff, vocab_size=vocab_size).to(device)




In [13]:
!pip install zstandard

Collecting zstandard
  Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: zstandard
Successfully installed zstandard-0.22.0


In [18]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

# Training parameters
num_epochs = 1
learning_rate = 5e-5
max_steps = 10000  # Set a fixed number of training steps

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=max_steps)

# Train on whatever device the model already lives on instead of assuming CUDA.
train_device = next(model.parameters()).device

# Built once (not per step). Targets equal to -100 are excluded from the loss,
# which is how padded positions are ignored below.
IGNORE_INDEX = -100
loss_fn = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)


def _as_batch_tensor(field):
    """Return a collated batch field as a (batch, seq_len) tensor on the training device.

    The DataLoader's default collate turns per-example Python lists into a list
    of per-position tensors of shape (batch,); those are stacked back along
    dim=1. Fields that are already tensors are only moved to the device.
    """
    if isinstance(field, (list, tuple)):
        field = torch.stack(list(field), dim=1)
    return field.to(train_device)


# Training loop
model.train()
step = 0
loss = None
for epoch in range(num_epochs):
    for batch in tqdm(train_dataloader):
        if step >= max_steps:
            break

        input_ids = _as_batch_tensor(batch['input_ids'])

        # Next-token objective: the target for position t is the token at t + 1.
        # Using the unshifted input as labels would only teach the model to copy
        # its input.
        labels = input_ids.clone()
        if 'attention_mask' in batch:
            attention_mask = _as_batch_tensor(batch['attention_mask'])
            # Do not train on padding.
            labels = labels.masked_fill(attention_mask == 0, IGNORE_INDEX)
        shift_labels = labels[:, 1:].reshape(-1)

        if not bool((shift_labels != IGNORE_INDEX).any()):
            # Every target is padding; the mean loss would be NaN, so skip the batch.
            continue

        optimizer.zero_grad()
        outputs = model(input_ids)
        shift_logits = outputs[:, :-1, :].reshape(-1, outputs.size(-1))
        loss = loss_fn(shift_logits, shift_labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        step += 1

    if step >= max_steps:
        break

    # `loss` stays None when the dataloader produced no trainable batch.
    if loss is not None:
        print(f"Epoch {epoch+1}/{num_epochs} completed. Loss: {loss.item()}")


0it [00:00, ?it/s]


FileNotFoundError: https://the-eye.eu/public/AI/pile/train/00.jsonl.zst