In [None]:
# if run on colab
%%capture
!git clone https://github.com/traeuker/CreateGPT.git
%cd /content/CreateGPT
!pip install -r requirements.txt

In [5]:
import os 
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# CreateGPT

In [1]:
import gpt_model
import sample
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## Load pre-trained weights 

You need a model with the same architecture as the pre-trained model

You also need the same tokenizer as the pre-trained model to fit vocab size and special tokens


In [2]:
import load_weights

transformer_config = gpt_model.TransformerConfig(
    num_layers=12,
    num_heads=12,
    vocab_size=50257,
    hidden_size=768,
    max_seq_len=1024,
    dropout=0.1,
    layer_norm_epsilon=1e-5
)

model = gpt_model.DecoderOnlyTransformer(config=transformer_config)
model = load_weights.copy_weights(model)

tokenizer = load_weights.get_tokenizer()

In [3]:
# test if weights were copied correctly
text = "My research interests touch several areas of Machine Learning, Signal Processing,"
output = sample.sample_tokens(model=model, initial_text=text, tokenizer=tokenizer, max_tokens_generated=30, temperature=1.0, top_k=20)
print(output)
# Does the output look like a somewhat reasonable continuation of the input text?

My research interests touch several areas of Machine Learning, Signal Processing, and Machine Learning as well as the development and implementation of Artificial Intelligence, which are all aspects of our research. As such, we are working on many


## Train your model

You can train your model with your own dataset!

You only need a text file with enough data (and ideally enough compute that you are not limited by training time)

In [4]:
%%capture
from urllib.request import urlopen
import data_processing
import load_weights

# Url to a text file 
url = "https://www.gutenberg.org/cache/epub/100/pg100.txt"
data = urlopen(url).read().decode('utf-8')
data = " ".join(data.split())

# fraction of the dataset to use, using all of it may take a long time
fraction_of_dataset = 0.001


In [None]:
# Either load the dataset with the GPT2 tokenizer
tokenizer = load_weights.get_tokenizer()
dataset = data_processing.WordsDatasetTokenized(dataset=data, tokenizer=tokenizer, fraction=fraction_of_dataset)

In [None]:
# or create your own dataset and create your tokenizer from it 
import re
words = re.split(r"\b", data)
dataset = data_processing.WordsDataset(words=words, fraction=fraction_of_dataset)
tokenizer = data_processing.WordsTokenizer(dataset)

In [5]:
# train model on the dataset
model = gpt_model.DecoderOnlyTransformer(config=transformer_config)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.CrossEntropyLoss()
trainloader, testloader = data_processing.get_dataloaders(
    dataset, batch_size=4)
trained_model = gpt_model.train(model, optimizer, trainloader, testloader, loss_fn, num_epochs=1,
                                save_dir="/Users/tilman/Documents/projects/arena/arena/CreateGPT2/CreateGPT/models", 
                                device=device, WANDB=False)


Beginning Training
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 0 Training Loss 12.6517: 100%|██████████| 291/291 [1:57:17<00:00, 24.18s/it]


Epoch 0 Loss: 34.9738
Test Accuracy: 0.1294
Training complete in 117m 28s


In [None]:
# test if training worked
text = "My research interests touch several areas of Machine Learning, Signal Processing "
output = sample.sample_tokens(model=trained_model, initial_text=text, tokenizer=tokenizer, max_tokens_generated=30, temperature=0.9, top_k=20)
print(output)
# Does the output look like a somewhat reasonable continuation of the input text?