In [1]:
from numpy import dtype
import torch
import argparse
from pytorch_lightning import Trainer
from lightning.pytorch.loggers import WandbLogger
import os
from train_utils import load_model_tokenizer, DataModule, TrainingModule

%load_ext autoreload
%autoreload 2

In [7]:
def train(args):
    model, tokenizer = load_model_tokenizer(args['model_name'])
    torch.set_float32_matmul_precision("medium")
    # wandb_logger = WandbLogger(
    #     log_model="all",
    #     name=f"{args['model_name']}_local",
    #     project="tom",
    # )

    datamodule = DataModule(
        tokenizer,
        args['data_path'],
        args['batch_size'],
        model.config.n_positions,
        args['seed'],
    )
    model = TrainingModule(
        model,
        tokenizer,
        seed=args['seed'],
        output_dir=args['output_dir'],
        lr=args['lr'],
        eval_metric=datamodule.eval_metric(),
    )

    trainer = Trainer(
        default_root_dir=args["output_dir"],
        devices=args["devices"],
        max_epochs=args["epochs"],
        precision="32-true",
        gradient_clip_val=1.0,
        deterministic=True,
        accumulate_grad_batches=args["accumulate_grad_batches"],
        accelerator="gpu",
        enable_checkpointing=False,
        val_check_interval=0.1,
        strategy="ddp_notebook",
        log_every_n_steps=1,
    )

    trainer.fit(model, datamodule)
    # trainer.test(model, datamodule.test_dataloader())
    trainer.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")

In [8]:
args = {
    "model_name": "gpt2",
    "data_path": "training_data_small.jsonl",
    "seed": 0,
    "output_dir": "results",
    "lr": 1e-4,
    "devices": [0],
    "accumulate_grad_batches": 1,
    "epochs": 1,
    "batch_size": 1,
}

train(args)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 173, in _wrapping_function
    results = function(*args, **kwargs)
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 943, in _run
    self.strategy.setup_environment()
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 153, in setup_environment
    super().setup_environment()
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 129, in setup_environment
    self.accelerator.setup_device(self.root_device)
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/pytorch_lightning/accelerators/cuda.py", line 46, in setup_device
    _check_cuda_matmul_precision(device)
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/lightning_fabric/accelerators/cuda.py", line 361, in _check_cuda_matmul_precision
    if not torch.cuda.is_available() or not _is_ampere_or_later(device):
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/lightning_fabric/accelerators/cuda.py", line 355, in _is_ampere_or_later
    major, _ = torch.cuda.get_device_capability(device)
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/torch/cuda/__init__.py", line 357, in get_device_capability
    prop = get_device_properties(device)
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/torch/cuda/__init__.py", line 371, in get_device_properties
    _lazy_init()  # will define _get_device_properties
  File "/home/local_nikhil/.conda/envs/anima/lib/python3.10/site-packages/torch/cuda/__init__.py", line 217, in _lazy_init
    raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method


In [None]:
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="llama")

    parser.add_argument(
        "--data_path",
        type=str,
        default="/home/local_nikhil/Projects/PositionLens/datasets/gpt2/position_detector.json",
    )

    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--output_dir", type=str, default="./weights")
    parser.add_argument("--devices", type=list, default=[0])
    parser.add_argument("--accumulate_grad_batches", type=int, default=1)
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--seed", type=int, default=10)

    args = parser.parse_args()
    args.output_dir = os.path.join(args.output_dir, args.model_name)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    train(args)

# Load Model and Tokenizer

In [2]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1").to(device)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Load Dataset

Downloading and preparing dataset json/default to /home/local_nikhil/.cache/huggingface/datasets/json/default-7eb05ebb7d4c0727/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/local_nikhil/.cache/huggingface/datasets/json/default-7eb05ebb7d4c0727/0.0.0. Subsequent calls will reuse this data.


In [15]:
data = load_dataset("json", data_files="training_data.jsonl")

Downloading and preparing dataset json/default to /home/local_nikhil/.cache/huggingface/datasets/json/default-c2c79431c022fd19/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/local_nikhil/.cache/huggingface/datasets/json/default-c2c79431c022fd19/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
TEMPLATE = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the dialog using provided information.\n### Input:{input}\n### Response:"

In [19]:
def tokenize(prompt, add_eos_token=True):
    # there's probably a way to do this with the tokenizer settings
    # but again, gotta move fast
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=model.config.max_position_embeddings,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < model.config.max_position_embeddings
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result


def generate_and_tokenize_prompt(data_point, train_on_inputs=True, add_eos_token=False):
    full_prompt = TEMPLATE.format(input=data_point["input"])

    tokenized_full_prompt = tokenize(full_prompt)
    if not train_on_inputs:
        user_prompt = TEMPLATE.format(data_point["input"])
        tokenized_user_prompt = tokenize(
            user_prompt, add_eos_token=add_eos_token
        )
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        if add_eos_token:
            user_prompt_len -= 1

        tokenized_full_prompt["labels"] = [
            -100
        ] * user_prompt_len + tokenized_full_prompt["labels"][
            user_prompt_len:
        ]  # could be sped up, probably
    return tokenized_full_prompt

In [20]:
val_size = 0.1
train_val = data.train_test_split(test_size=val_size, seed=0)
train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)

AttributeError: 'DatasetDict' object has no attribute 'train_test_split'

In [11]:
# Create a DataLoader that will return batches of data
batch_size = 8
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)