In [None]:
# Hyperparameters shared by the model, data-loading and training cells below.
config = dict(
    hyperparameters=dict(
        batch_size=2,
        epochs=5,
        learning_rate=5e-4,
        warmup_steps=1e2,
        epsilon=1e-08,
        # train() prints a generated sample whenever the step index is a
        # non-zero multiple of this value.
        sample_every=200,
        seed_val=42,
    )
)

In [None]:
import wandb
# Authenticate this session with Weights & Biases before train() creates a run.
wandb.login()

True

In [None]:
# Show the GPU attached to this runtime and its current memory usage.
!nvidia-smi

Wed Jan 17 14:45:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0              32W /  70W |   6095MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

# loading
# config = OmegaConf.load("project_name/config.yaml")

# Module-level defaults for Model.__init__(lr=..., eps=...). They are bound as
# default arguments when the class below is defined, so later changes to
# `config` do not affect them unless this cell is re-run.
lr = config["hyperparameters"]["learning_rate"]
eps = config["hyperparameters"]["epsilon"]


class Model(torch.nn.Module):
    """GPT-2 language model bundled with its tokenizer, optimizer and LR scheduler.

    Args:
        lr: Learning rate handed to AdamW.
        eps: Epsilon handed to AdamW.
        model_version: Name or path passed to every ``from_pretrained`` call,
            e.g. ``"gpt2"`` or a directory previously written by ``save_model``.
    """

    def __init__(self, lr=lr, eps=eps, model_version="gpt2"):
        super().__init__()

        # Load the GPT-2 configuration and the language-model head.
        configuration = GPT2Config.from_pretrained(model_version, output_hidden_states=False)
        self.model = GPT2LMHeadModel.from_pretrained(model_version, config=configuration)
        # Load the tokenizer with explicit start / end / padding tokens.
        self.tokenizer = GPT2Tokenizer.from_pretrained(
            model_version, bos_token="<|startoftext|>", eos_token="<|endoftext|>", pad_token="<|pad|>"
        )
        # Optimizer settings.
        self.lr = lr
        self.epsilon = eps
        # Most recent optimizer returned by configure_optimizers(); reused by
        # configure_scheduler() so both refer to the same object.
        self._optimizer = None

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped GPT-2 model and return its output object unchanged."""
        return self.model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels, token_type_ids=None
        )

    def resize_token_embeddings(self, size):
        """Resize the wrapped model's token embeddings to ``size`` entries."""
        return self.model.resize_token_embeddings(size)

    def generate(self, inputs=None, bos_token_id=None, max_output_length=200, num_return_sequences=1):
        """Sample text from the model and return it as a list of decoded strings.

        Args:
            inputs: Optional tensor of prompt token ids; moved to the model's
                device before generation.
            bos_token_id: Token id forwarded to the underlying ``generate`` call.
            max_output_length: Forwarded as ``max_length``.
            num_return_sequences: Number of sequences to sample.

        Returns:
            list[str]: One decoded string (special tokens skipped) per sequence.
        """
        if inputs is not None:
            # A prompt built on the CPU would otherwise clash with a model on the GPU.
            inputs = inputs.to(self.model.device)

        sample_outputs = self.model.generate(
            inputs=inputs,
            bos_token_id=bos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
            do_sample=True,
            top_k=50,
            max_length=max_output_length,
            top_p=0.90,
            num_return_sequences=num_return_sequences,
        )
        return [
            self.tokenizer.decode(sample_output, skip_special_tokens=True)
            for sample_output in sample_outputs
        ]

    def configure_optimizers(self):
        """Create a new AdamW optimizer over the model parameters and remember it."""
        self._optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=self.epsilon)
        return self._optimizer

    def configure_scheduler(self, num_warmup_steps, num_training_steps, optimizer=None):
        """Create a linear warmup/decay schedule bound to the training optimizer.

        Args:
            num_warmup_steps: Number of warmup steps.
            num_training_steps: Total number of training steps.
            optimizer: Optimizer to schedule. When omitted, the optimizer most
                recently returned by ``configure_optimizers`` is used; one is
                created only if none exists yet.

        Returns:
            The scheduler returned by ``get_linear_schedule_with_warmup``.
        """
        if optimizer is None:
            # Building a second optimizer here would leave the schedule attached
            # to an optimizer that nobody steps.
            optimizer = self._optimizer if self._optimizer is not None else self.configure_optimizers()
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
        )

    def save_model(self, output_dir):
        """Write model weights/config and tokenizer files to ``output_dir``."""
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        print("Saving model to %s" % output_dir)


In [None]:
import torch
from torch.utils.data import Dataset


class GPT2Dataset(Dataset):
    """Texts tokenised up front and served as ``(input_ids, attention_mask)`` tensors.

    Each text is wrapped in ``<|startoftext|>`` / ``<|endoftext|>`` and encoded
    with ``truncation=True`` and ``padding="max_length"`` in the constructor.

    Args:
        txt_list: Iterable of strings to encode.
        tokenizer: Callable tokenizer returning a mapping with the keys
            ``"input_ids"`` and ``"attention_mask"``.
        gpt2_type: Accepted for interface compatibility; not used here.
        max_length: ``max_length`` passed to the tokenizer.
    """

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
        self.tokenizer = tokenizer

        encoded_texts = [
            tokenizer(
                "<|startoftext|>" + txt + "<|endoftext|>",
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            for txt in txt_list
        ]
        self.input_ids = [torch.tensor(enc["input_ids"]) for enc in encoded_texts]
        self.attn_masks = [torch.tensor(enc["attention_mask"]) for enc in encoded_texts]

    def __len__(self):
        """Number of encoded texts."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask


In [None]:
from datasets import load_dataset
import pandas as pd


def get_data():
    """Load the ``izumi-lab/open-text-books`` dataset via ``load_dataset`` and return it as is."""
    return load_dataset("izumi-lab/open-text-books")


def process_data(rawDataset):
    """Collect the ``"text"`` field of the train split into a one-column DataFrame.

    Args:
        rawDataset: Mapping with a ``"train"`` entry that iterates over records
            (mappings) carrying a ``"text"`` key.

    Returns:
        pandas.DataFrame: Single column ``"text"`` with missing values removed.
        The original row positions are kept as the index.
    """
    # Read the train split directly. Wrapping the whole mapping in a DataFrame
    # first copies every split and fails when the splits differ in length.
    texts = [record["text"] for record in rawDataset["train"]]

    df = pd.DataFrame(texts, columns=["text"])
    return df.dropna()  # remove NA values


# Load the corpus once and flatten it into the one-column ("text") DataFrame
# that dataloader() samples from.
data = process_data(get_data())


In [None]:
import os
import time
import torch
import wandb
import random
import datetime
import pandas as pd
from torch.utils.data import DataLoader, random_split, RandomSampler, SequentialSampler


# loading
# config = OmegaConf.load("project_name/config.yaml")

# Flat view of the hyperparameters used by the training cells; train() also
# hands this dict to wandb.init as the run configuration.
parameter = {
    name: config["hyperparameters"][name]
    for name in (
        "epochs",
        "learning_rate",
        "warmup_steps",
        "epsilon",
        "batch_size",
        # a sample is generated every `sample_every` training steps
        "sample_every",
        "seed_val",
    )
}


def format_time(elapsed):
    """Render a duration in seconds as the ``str`` of a whole-second ``timedelta``."""
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


def dataloader(tokenizer, batch_size, n_samples=3000, max_length=768, train_fraction=0.9):
    """Build the training and validation DataLoaders from the module-level ``data`` frame.

    Args:
        tokenizer: Tokenizer handed to ``GPT2Dataset``.
        batch_size: Batch size used by both loaders.
        n_samples: Maximum number of rows drawn from ``data``; capped at the
            number of rows available.
        max_length: ``max_length`` handed to ``GPT2Dataset``.
        train_fraction: Share of the sampled rows used for training; the rest
            is used for validation.

    Returns:
        tuple: ``(train_dataloader, validation_dataloader)``.
    """
    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so never request more than the corpus holds.
    n_rows = min(n_samples, len(data))
    df = data.sample(n=n_rows, random_state=parameter["seed_val"])
    dataset = GPT2Dataset(df["text"], tokenizer, max_length=max_length)
    print(len(dataset))

    # Split into training and validation sets.
    train_size = int(train_fraction * len(dataset))
    val_size = len(dataset) - train_size

    # A dedicated, seeded generator makes the split depend only on seed_val,
    # not on how much global RNG state was consumed before this call.
    split_rng = torch.Generator().manual_seed(parameter["seed_val"])
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_rng)

    print("{:>5,} training samples".format(train_size))
    print("{:>5,} validation samples".format(val_size))

    # Training batches are drawn in random order.
    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
    )

    # Validation order does not matter, so read it sequentially.
    validation_dataloader = DataLoader(
        val_dataset,
        sampler=SequentialSampler(val_dataset),
        batch_size=batch_size,
    )
    return train_dataloader, validation_dataloader


In [None]:
def train():
    """Fine-tune the GPT-2 ``Model`` on the sampled corpus and save it to ``models/``.

    Seeds ``random`` and ``torch`` from ``parameter["seed_val"]``, trains for
    ``parameter["epochs"]`` epochs, validates after every epoch, logs losses to
    Weights & Biases and finally saves model and tokenizer via ``Model.save_model``.
    The W&B run is closed even when training raises.

    Returns:
        list[dict]: One entry per epoch with the keys ``"epoch"``,
        ``"Training Loss"``, ``"Valid. Loss"``, ``"Training Time"`` and
        ``"Validation Time"``.
    """
    random.seed(parameter["seed_val"])
    torch.manual_seed(parameter["seed_val"])
    torch.cuda.manual_seed_all(parameter["seed_val"])

    # Per-epoch statistics, returned to the caller.
    training_stats = []

    total_t0 = time.time()
    wandb.init(project="mlops_g30", config=parameter)
    try:
        # A GPT model with arguments "lr" of learning rate & "eps" of epsilon
        model = Model(lr=parameter["learning_rate"], eps=parameter["epsilon"])
        tokenizer = model.tokenizer

        # Use the GPU when one is available; otherwise fall back to the CPU
        # instead of failing.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        # Load the data
        train_dataloader, valid_dataloader = dataloader(tokenizer, parameter["batch_size"])

        # Special tokens (bos_token, etc.) were added to the tokenizer, so the
        # embedding matrix has to be resized to match the vocabulary.
        model.resize_token_embeddings(len(tokenizer))

        # Total number of training steps is [number of batches] x [number of epochs].
        total_steps = len(train_dataloader) * parameter["epochs"]

        # Create the learning rate scheduler, then step the optimizer it is
        # bound to, so the warmup/decay schedule really drives the optimizer
        # that updates the weights.
        scheduler = model.configure_scheduler(
            num_warmup_steps=parameter["warmup_steps"], num_training_steps=total_steps
        )
        optimizer = scheduler.optimizer

        for epoch_i in range(parameter["epochs"]):
            # ========================================
            #               Training
            # ========================================
            print("")
            print("======== Epoch {:} / {:} ========".format(epoch_i + 1, parameter["epochs"]))
            print("Training...")

            # Save the starting training time for epoch
            t0 = time.time()
            total_train_loss = 0

            # start training mode
            model.train()

            for step, batch in enumerate(train_dataloader):
                loss = batch_train(model, device, batch)
                batch_loss = loss.item()
                total_train_loss += batch_loss

                loss.backward()
                optimizer.step()
                scheduler.step()

                # Report and sample every `sample_every` batches. This runs
                # after the update so the autograd graph of this batch is
                # already released while text is generated.
                if step % parameter["sample_every"] == 0 and step != 0:
                    elapsed = format_time(time.time() - t0)
                    print(
                        "  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.".format(
                            step, len(train_dataloader), batch_loss, elapsed
                        )
                    )
                    wandb.log({"Batch Loss": batch_loss})
                    sample(model)

            # Average loss over all batches (guarded against an empty loader).
            avg_train_loss = total_train_loss / max(len(train_dataloader), 1)

            # Measure how long this epoch took.
            training_time = format_time(time.time() - t0)

            print("")
            print("  Average training loss: {0:.2f}".format(avg_train_loss))
            print("  Training epoch took: {:}".format(training_time))

            # ========================================
            #               Validation
            # ========================================
            print("")
            print("Running Validation...")
            avg_val_loss, validation_time = valid(model, device, valid_dataloader)
            print("  Validation Loss: {0:.2f}".format(avg_val_loss))
            print("  Validation took: {:}".format(validation_time))

            # Record all statistics from this epoch.
            training_stats.append(
                {
                    "epoch": epoch_i + 1,
                    "Training Loss": avg_train_loss,
                    "Valid. Loss": avg_val_loss,
                    "Training Time": training_time,
                    "Validation Time": validation_time,
                }
            )
            # log metrics to wandb
            wandb.log({"epoch": epoch_i + 1, "Training Loss": avg_train_loss, "Valid. Loss": avg_val_loss})

        print("")
        print("Training complete!")
        print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))

        # ========================================
        #               Save
        # ========================================
        # Files written with `save_pretrained()` can be reloaded with `from_pretrained()`.
        output_dir = "models/"
        os.makedirs(output_dir, exist_ok=True)
        model.save_model(output_dir)
    finally:
        # Close the W&B run even if training or saving failed.
        wandb.finish()

    return training_stats

def _labels_ignoring_padding(input_ids, attention_mask):
    """Return ``input_ids`` as LM labels with masked-out positions set to -100.

    Positions whose attention mask is 0 are padding. Labels equal to -100 are
    skipped by the Hugging Face language-model loss, so padding no longer
    contributes to (and dilutes) the reported loss.
    """
    return input_ids.masked_fill(attention_mask == 0, -100)


def batch_train(model, device, batch):
    """Run one forward pass in preparation for a training step and return the loss.

    Args:
        model: Callable model with ``zero_grad``; called with ``input_ids``,
            ``labels`` and ``attention_mask`` keyword arguments.
        device: Device the batch tensors are moved to.
        batch: Pair ``(input_ids, attention_mask)`` as produced by the DataLoader.

    Returns:
        The first element of the model output (the loss). Gradients are reset
        before the forward pass; the caller runs ``backward()`` and the
        optimizer step.
    """
    b_input_ids = batch[0].to(device)
    b_masks = batch[1].to(device)
    b_labels = _labels_ignoring_padding(b_input_ids, b_masks)

    model.zero_grad()

    outputs = model(input_ids=b_input_ids, labels=b_labels, attention_mask=b_masks)
    return outputs[0]


def sample(model):
    """Print three generated samples, then put the model back into training mode."""
    model.eval()

    # A random token id is used as the start token, so every call starts
    # generation from a different token.
    sample_outputs = model.generate(bos_token_id=random.randint(1, 30000), num_return_sequences=3)
    for i, sample_output in enumerate(sample_outputs):
        print("{}: {}".format(i, sample_output))

    model.train()


def valid(model, device, valid_dataloader):
    """Evaluate ``model`` on ``valid_dataloader`` without gradient tracking.

    Labels are masked exactly as in ``batch_train`` so training and validation
    losses are comparable.

    Returns:
        tuple: ``(avg_val_loss, validation_time)``, the mean loss per batch
        (``nan`` when the loader yields no batches) and the elapsed time
        formatted by ``format_time``. The model is left in evaluation mode.
    """
    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    num_batches = 0

    # Evaluate data for one epoch
    with torch.no_grad():
        for batch in valid_dataloader:
            b_input_ids = batch[0].to(device)
            b_masks = batch[1].to(device)
            b_labels = _labels_ignoring_padding(b_input_ids, b_masks)

            outputs = model(input_ids=b_input_ids, attention_mask=b_masks, labels=b_labels)

            total_eval_loss += outputs[0].item()
            num_batches += 1

    # An empty validation set has no meaningful loss; report nan instead of
    # raising ZeroDivisionError.
    avg_val_loss = total_eval_loss / num_batches if num_batches else float("nan")

    validation_time = format_time(time.time() - t0)
    return avg_val_loss, validation_time


# Run the full training loop when this code is executed as the main module.
if __name__ == "__main__":
    train()


3000
2,700 training samples
  300 validation samples

Training...




  Batch   200  of  1,350. Loss: 3.370478868484497.   Elapsed: 0:01:44.
0:  bipartisanA said a person may have a serious problem with the issue. This could be that they are engaged in a serious problem with their relationship between the person and the person, especially if the situation arises of a child or a child.
A person may have a significant problem with the relationship between the person and the person because of the person.

A person may have a serious problem with the relationship between the person and the person, especially if the situation arises from a child or a child.
A person may not have a significant problem with the relationship between the person and the person because of the person or the relationship between the person and the person.
A person may have a serious problem with the relationship between the person and the person, especially if the situation arises from a child or a child.
The person and the relationship between the person and the person may have a se

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Batch Loss,█▅▁▁▅▂▇▁▂▄▁▁
Training Loss,█▁
Valid. Loss,▁█
epoch,▁█

0,1
Batch Loss,0.49434
Training Loss,1.23138
Valid. Loss,1.33792
epoch,2.0


In [None]:

# Directory that train() saved the fine-tuned model and tokenizer to.
output_dir = "models/"

# Prompt the fine-tuned model is asked to continue.
text = "What is the fastest car in the"

# Reload model and tokenizer from output_dir instead of the stock "gpt2" checkpoint.
model = Model(model_version=output_dir)

# Put the model in evaluation mode to deactivate the dropout modules.
model.eval()



def make_prediction(model, input_prompt, max_output_length):
    """Generate text that continues ``input_prompt``.

    Args:
        model: Model used for making predictions; must expose
            ``tokenizer.encode`` and ``generate`` (e.g. ``Model``).
        input_prompt (str): Input prompt that will be used to generate text.
        max_output_length (int): Maximum length, forwarded to
            ``model.generate`` as ``max_output_length``.

    Returns:
        prediction (str): The first string returned by ``model.generate``
        (input + text generated by the model).
    """
    indexed_tokens = model.tokenizer.encode(input_prompt)
    tokens_tensor = torch.tensor([indexed_tokens])

    # Put the prompt on the same device as the model weights; a CPU prompt fed
    # to a model on the GPU fails with a device mismatch.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        tokens_tensor = tokens_tensor.to(first_param.device)

    prediction = model.generate(inputs=tokens_tensor, max_output_length=max_output_length, num_return_sequences=1)

    return prediction[0]

# Generate text from the prompt with max_output_length=300 and print it.
print(make_prediction(model,text,300))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


KeyboardInterrupt: 

In [None]:
from google.colab import drive
# Mount Google Drive in the Colab runtime, then copy the saved "models"
# directory into it so the checkpoint outlives the session.
drive.mount('/content/drive')
!cp -r models /content/drive/MyDrive/dtu_mlops