In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the working
# directory used by later cells is a path below this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install torch
!pip install numpy
!pip install transformers
!pip install datasets
!pip install tqdm

!pip install trl
!pip install peft

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
Inst

In [None]:
# --- Run configuration -------------------------------------------------------
# Locations: base folder on the mounted Drive and the names resolved against it.
working_directory = "/content/drive/MyDrive/AISE"
model_name = "gpt2v1"        # sub-folder of working_directory holding the starting checkpoint
tokenizer_name = "gpt2"      # identifier handed to AutoTokenizer.from_pretrained
dataset_name = "gpt_750.pt"  # not referenced by the code visible in this cell

# Training hyperparameters.
num_epochs = 20
batch_size = 20              # per-device training batch size
max_length = 750             # maximum sequence length handed to the trainer
start_learning_rate = 3e-6
import os, json, peft
from peft import LoraConfig
from datasets import Dataset
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
)
from tqdm import tqdm

import torch
import numpy as np
import random
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import DataCollatorForLanguageModeling

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed unchanged to `random.seed`, `np.random.seed`
            and `torch.manual_seed` (and to the CUDA generators when CUDA
            is available).

    Returns:
        None.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators are only seeded when a CUDA device is reported.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every RNG up front so the run is repeatable (1337 is an arbitrary choice).
set_seed(1337)

# Tokenizer: reuse the end-of-sequence token id for padding and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.padding_side = "right"
tokenizer.pad_token_id = tokenizer.eos_token_id

# Base causal LM, loaded from the checkpoint folder under the working directory.
model = AutoModelForCausalLM.from_pretrained(
    os.path.join(working_directory, model_name)
)

# LoRA adapter settings.
# NOTE(review): per the PEFT docs the adapter update is scaled by lora_alpha / r
# (8 / 128 here) — confirm that such a small scale is intended.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",  # adapter is attached to a causal language model
    r=128,                  # LoRA rank
    lora_alpha=8,           # LoRA scaling parameter
    lora_dropout=0.05,      # dropout applied inside the LoRA layers
    bias="none",            # "none": no bias parameters are trained
)

# Wrap the base model with the adapter, then move it to the GPU.
model = peft.get_peft_model(model, peft_config)
model.to("cuda")

# Read the fine-tuning examples from JSON and keep at most the first 10000.
examples_path = os.path.join(working_directory, "ft_examples.json")
with open(examples_path, 'r', encoding='utf8') as file:
    text_list = json.load(file)
text_list = text_list[:10000]

# Single "text" column, split 90% train / 10% test.
dataset = Dataset.from_dict({"text": text_list}).train_test_split(test_size=0.1)

# Training loop
from transformers import Trainer, TrainingArguments

# Define the training arguments.
# The run length is controlled by num_train_epochs alone. max_steps is
# deliberately not set: a positive value overrides num_train_epochs, and
# deriving it by hand as num_epochs * rows // batch_size rounds down and only
# matches a single-device run without gradient accumulation.
training_args = TrainingArguments(
    output_dir=os.path.join(working_directory, "results"),  # output directory
    num_train_epochs=num_epochs,  # total number of training epochs
    per_device_train_batch_size=batch_size,  # batch size per device during training
    learning_rate=start_learning_rate,  # initial learning rate
    warmup_steps=0,  # no learning-rate warmup
    weight_decay=0.01,  # strength of weight decay
    logging_dir=None,  # leave the log directory at the library default
    logging_steps=100,  # log every 100 update steps
    save_strategy="epoch",  # save a checkpoint at the end of every epoch
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    fp16=True,  # mixed-precision (fp16) training
)
# Initialize the Trainer
from trl import SFTTrainer
# Supervised fine-tuning trainer reading the "text" column of both splits.
trainer = SFTTrainer(
    model=model,  # the instantiated Hugging Face Transformers model to be trained
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_args,  # training arguments, defined above
    train_dataset=dataset["train"],  # training split
    eval_dataset=dataset["test"],  # held-out split used for evaluation
    dataset_text_field="text",
    max_seq_length=max_length,
)

# Run training, then write the resulting model under the working directory.
trainer.train()
final_model_dir = os.path.join(working_directory, "gpt2v2")
trainer.save_model(final_model_dir)




Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
