In [1]:
# Mount Google Drive inside the Colab VM so files under the mount point are reachable.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install bitsandbytes
!pip install accelerate
!pip install peft
!pip install -q -U trl



In [None]:
# --- Run configuration -------------------------------------------------------
# Directory holding dataset.json, the tokenized cache, checkpoints and logs.
working_directory = "/content/drive/MyDrive/AISE"
# File name (inside working_directory) of the cached tokenized dataset.
dataset_name = "gpt_750.pt"

# Checkpoint to fine-tune. Alternatives kept for reference (commented out):
#   "tiiuae/falcon-7b"
#   "Open-Orca/OpenOrca-Platypus2-13B"
#   "garage-bAInd/Platypus2-7B"
#   "roberta-large"
model_name = "gpt2"

# Tokenization / optimisation hyper-parameters.
max_length = 750            # window size, in tokens, used when chunking each text
batch_size = 20             # per-device training batch size
num_epochs = 3              # number of passes over the training split
start_learning_rate = 2e-5  # learning rate handed to the Trainer
import os, json
from datasets import Dataset
import bitsandbytes as bnb
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import DataCollatorForLanguageModeling

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Only borrow EOS as the padding token when the checkpoint does not define one
# (e.g. GPT-2); a checkpoint's own pad token is left untouched.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(model_name)
# Use the GPU when one is attached, otherwise stay on CPU instead of raising.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load the dataset
# dataset.json is a mapping from an example key to a record; only the record's
# "text" field is used below.
dataset_path = os.path.join(working_directory, "dataset.json")
with open(dataset_path, 'r', encoding='utf8') as file:
    dataset_dict = json.load(file)
# Iterate over the records themselves: a module-level loop variable named `id`
# would shadow the builtin for every later cell.
dataset = [
    record["text"]
    for record in tqdm(dataset_dict.values(), desc = "compiling dataset strings...")
]

# Tokenize the dataset
def tokenize_function(texts):
    """Turn raw strings into fixed-length causal-LM training examples.

    Each text is tokenized without truncation and cut into windows of
    ``max_length`` tokens. Consecutive windows overlap by one token, so every
    token except the first of a text is a prediction target exactly once.

    Reads the module-level ``tokenizer`` and ``max_length``.

    Args:
        texts: Iterable of strings to tokenize.

    Returns:
        dict with three ``torch.long`` tensors of shape
        ``(num_windows, max_length)``:
          * ``input_ids``      - token ids, right-padded with ``tokenizer.pad_token_id``
          * ``attention_mask`` - 1 for real tokens, 0 for padding
          * ``labels``         - copy of ``input_ids`` with padding set to -100
        ``num_windows`` is 0 when no text yields a usable window.

    Raises:
        ValueError: if ``max_length`` is smaller than 2 (a window needs at
            least one input token and one target token).

    Notes:
        Labels are intentionally NOT shifted here. Hugging Face causal-LM
        heads shift the labels by one position themselves when computing the
        loss, so shifting them here as well would train the model to predict
        the token two positions ahead. Padding is labelled -100 so it is
        ignored by the loss (the pad id may equal the EOS id).
        Tensors cached from an implementation that pre-shifted labels are not
        compatible and should be regenerated.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2")

    ignore_index = -100          # label value skipped by the cross-entropy loss
    pad_id = tokenizer.pad_token_id
    stride = max_length - 1      # one-token overlap between consecutive windows

    input_rows, mask_rows, label_rows = [], [], []
    for text in tqdm(texts, desc = "Tokenizing..."):
        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for start in range(0, len(token_ids), stride):
            window = token_ids[start:start + max_length]
            if len(window) < 2:
                # A single token has nothing to predict; when it is the
                # trailing overlap token it was already a target in the
                # previous window.
                continue
            n_pad = max_length - len(window)
            # Convert per window so memory stays at 8 bytes/token rather than
            # holding the whole corpus as nested Python lists.
            input_rows.append(torch.tensor(window + [pad_id] * n_pad, dtype=torch.long))
            mask_rows.append(torch.tensor([1] * len(window) + [0] * n_pad, dtype=torch.long))
            label_rows.append(torch.tensor(window + [ignore_index] * n_pad, dtype=torch.long))

    if not input_rows:
        # torch.stack rejects an empty list; return correctly shaped empties instead.
        return {
            "input_ids": torch.empty((0, max_length), dtype=torch.long),
            "attention_mask": torch.empty((0, max_length), dtype=torch.long),
            "labels": torch.empty((0, max_length), dtype=torch.long),
        }

    return {
        "input_ids": torch.stack(input_rows),
        "attention_mask": torch.stack(mask_rows),
        "labels": torch.stack(label_rows),
    }
# Reuse the cached tokenization when it exists; otherwise tokenize and cache it.
cache_path = os.path.join(working_directory, dataset_name)
if os.path.exists(cache_path):
    # NOTE: torch.load unpickles the file - only load caches this notebook wrote.
    tokenized_dataset = torch.load(cache_path)
else:
    tokenized_dataset = tokenize_function(dataset)
    torch.save(tokenized_dataset, cache_path)
dataset = Dataset.from_dict(tokenized_dataset)
# A fixed seed keeps the 1% held-out split identical across kernel restarts, so
# validation loss stays comparable and resumed runs never train on former eval rows.
split_seed = 42
dataset = dataset.train_test_split(test_size=0.01, seed=split_seed)

# Training loop
from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir=os.path.join(working_directory, "results"),  # checkpoint directory
    # Let the Trainer derive the step count from the epoch count. A hand-computed
    # max_steps overrides num_train_epochs and ignores gradient accumulation and
    # the number of devices.
    num_train_epochs=num_epochs,  # total number of training epochs
    per_device_train_batch_size=batch_size,  # batch size per device during training
    warmup_steps=0,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir=os.path.join(working_directory, "logs"),  # directory for storing logs
    logging_steps=1000,  # log the training loss every 1000 steps
    learning_rate=start_learning_rate,  # initial learning rate
    save_strategy="steps",  # save a checkpoint every `save_steps`
    save_steps=20000,  # number of steps between checkpoints
    fp16=torch.cuda.is_available(),  # mixed precision needs a GPU; disable it on CPU
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",  # evaluate (and log) every `eval_steps`
    eval_steps=20000,  # evaluation runs every 20000 steps
)
# Initialize the Trainer
trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=dataset["train"],  # training split
    eval_dataset=dataset["test"],  # held-out split used for validation loss
)

# Train the model
trainer.train()

# Save the trained model together with its tokenizer so the output directory
# can be reloaded on its own with from_pretrained().
trained_model_dir = os.path.join(working_directory, "trained_model")
trainer.save_model(trained_model_dir)
tokenizer.save_pretrained(trained_model_dir)


compiling dataset strings...: 100%|██████████| 622444/622444 [00:00<00:00, 1313023.14it/s]
Tokenizing...:   0%|          | 0/622444 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5778 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing...: 100%|██████████| 622444/622444 [57:48<00:00, 179.47it/s]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
