In [39]:
# %%
import os
from os import path
import pandas as pd
import numpy as np
import glob
import math

# %%
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, GenerationConfig
from finetune import create_datasets, ConstantLengthDataset, chars_token_ratio, run_training, prepare_model_for_int8_training, print_trainable_parameters
from finetune import SavePeftModelCallback, LoadBestPeftModelCallback
from tqdm import tqdm

# %%
from datasets import load_dataset
from datasets import Dataset, Features, Value


from accelerate import Accelerator
from peft import LoraConfig, get_peft_model

# Definitions

In [19]:
# Identifier of the dataset variant used throughout this notebook.
DATASET_ID = "life2scenario_medium"

# All project paths hang off the current user's home directory.
HOME = os.path.expanduser("~")
LIFE2SCENARIO_ROOT_PATH = os.path.join(HOME, "Documents/life2scenario_core/")
DATASET_ROOT_PATH = os.path.join(
    LIFE2SCENARIO_ROOT_PATH,
    f"datasets/{DATASET_ID}",
)

print(DATASET_ROOT_PATH)

/mnt/home/yucedago/Documents/life2scenario_core/datasets/life2scenario_medium


In [26]:
# %%
# Directory holding the preprocessed dataset files for DATASET_ID.
PREP_PICKLES_ROOT = path.join(DATASET_ROOT_PATH, "prep_pickles/")
print(PREP_PICKLES_ROOT)

# Hub id of the base model; used below to load the tokenizer.
ORIG_MODEL = "bigcode/starcoderbase-3b"
# Fine-tuned checkpoint directory, derived from the project root instead of a
# hard-coded per-user absolute path so the notebook also runs for other users.
CHECKPOINT = path.join(LIFE2SCENARIO_ROOT_PATH, "starcoder/bigcode/starcoderbase_3b_40ep")

/mnt/home/yucedago/Documents/life2scenario_core/datasets/life2scenario_medium/prep_pickles/


# Checkpoint Load

In [4]:
import os

# Restrict the visible GPUs *before* the first CUDA query: the CUDA runtime
# reads CUDA_VISIBLE_DEVICES when it is first initialised, so setting it after
# torch.cuda.is_available() may come too late to have any effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
def get_gpus_max_memory(max_memory):
    """Build a per-GPU memory budget mapping.

    Args:
        max_memory: Budget value assigned, unchanged, to every CUDA device.

    Returns:
        dict: Maps each device index in ``range(torch.cuda.device_count())``
        to ``max_memory``; empty when no CUDA device is reported.
    """
    gpu_count = torch.cuda.device_count()
    return dict.fromkeys(range(gpu_count), max_memory)

# The tokenizer is taken from the base model. Quantization options such as
# load_in_8bit describe model weights, so they are not passed here.
tokenizer = AutoTokenizer.from_pretrained(ORIG_MODEL)

# Load the fine-tuned checkpoint with 8-bit weights and fp16 for the
# non-quantized parts.
# to save memory consider using fp16 or bf16 by specifying torch_dtype=torch.float16 for example
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT,
    torch_dtype=torch.float16,
    use_auth_token=True,
    load_in_8bit=True,
    # max_memory is only consulted while a device map is being inferred, so
    # request automatic placement to make the per-GPU budget take effect.
    # NOTE(review): this lets the model be spread over all visible GPUs -
    # confirm that is the intended placement for evaluation.
    device_map="auto",
    max_memory=get_gpus_max_memory("70GB"),
)



In [6]:
# Echo the loaded model so its module tree is shown as the cell output.
model

GPTBigCodeForCausalLM(
  (transformer): GPTBigCodeModel(
    (wte): Embedding(49152, 2816)
    (wpe): Embedding(8192, 2816)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPTBigCodeBlock(
        (ln_1): LayerNorm((2816,), eps=1e-05, elementwise_affine=True)
        (attn): GPTBigCodeAttention(
          (c_attn): Linear8bitLt(in_features=2816, out_features=3072, bias=True)
          (c_proj): Linear8bitLt(in_features=2816, out_features=2816, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((2816,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTBigCodeMLP(
          (c_fc): Linear8bitLt(in_features=2816, out_features=11264, bias=True)
          (c_proj): Linear8bitLt(in_features=11264, out_features=2816, bias=True)
          (act): PytorchGELUTanh()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPTBigCodeBloc

In [7]:
# Show which special tokens (bos/eos/unk and the additional ones) the tokenizer defines.
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'additional_special_tokens': ['<|endoftext|>',
  '<fim_prefix>',
  '<fim_middle>',
  '<fim_suffix>',
  '<fim_pad>',
  '<filename>',
  '<gh_stars>',
  '<issue_start>',
  '<issue_comment>',
  '<issue_closed>',
  '<jupyter_start>',
  '<jupyter_text>',
  '<jupyter_code>',
  '<jupyter_output>',
  '<empty_output>',
  '<commit_before>',
  '<commit_msg>',
  '<commit_after>',
  '<reponame>']}

# Load the dataset

In [24]:
# Read the prepared CSV and keep only the two text columns used downstream.
train_final = pd.read_csv(
    path.join(PREP_PICKLES_ROOT, "train_dataset.csv")
).loc[:, ["request", "response"]]

# Preview the first rows as the cell output.
train_final.head()

Unnamed: 0,request,response
0,i would like you to add pedestrian close to st...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
1,please add pedestrian close to standing?\n```\...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
2,i would like you to add pedestrian close to st...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
3,could you remove pedestrian actor named pedest...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
4,"add pedestrian at location (-85.7, -69.63, 0.5...","Here is the result:\n```\n<?xml version=""1.0"" ..."


In [25]:
# %%
# Wrap the pandas frame as a Hugging Face Dataset with both columns typed as strings.
life2scenario_dataset = Dataset.from_pandas(
  train_final,
  features=Features(
    {'request': Value('string'),
     'response': Value('string')
    })
)


# Hold out 1% of the rows as the "test" split. A fixed seed keeps the split
# identical across kernel restarts so evaluation numbers are comparable.
# NOTE(review): these rows come from train_dataset.csv; if the checkpoint was
# fine-tuned on that same file, this slice is not unseen data - confirm.
l2s_dataset = life2scenario_dataset.train_test_split(test_size=0.01, seed=0)

# Evaluate

In [35]:

def prepare_sample_text(example, input_column_name="prompt", output_column_name="completion"):
    """Render one dataset record as a "Question / Answer" text block.

    Args:
        example: Mapping that contains the two columns named below.
        input_column_name: Key whose value fills the question part.
        output_column_name: Key whose value fills the answer part.

    Returns:
        str: The formatted text for this record.
    """
    question = example[input_column_name]
    answer = example[output_column_name]
    return "Question: {}\n\nAnswer: {}".format(question, answer)


def chars_token_ratio(dataset, tokenizer, input_column_name="prompt", output_column_name="completion", nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.

    At most ``nb_examples`` records are read from the start of ``dataset``.
    Each one is formatted with ``prepare_sample_text`` and tokenized, and the
    function returns total characters divided by total tokens.
    """
    char_count = 0
    token_count = 0
    # range() comes first in zip() so iteration stops after nb_examples records
    # without pulling an extra one from the dataset iterator.
    sampled = zip(range(nb_examples), iter(dataset))
    for _index, record in tqdm(sampled, total=nb_examples):
        sample = prepare_sample_text(record, input_column_name, output_column_name)
        char_count += len(sample)
        # Fast tokenizers expose the token list on the encoding; others use tokenize().
        pieces = tokenizer(sample).tokens() if tokenizer.is_fast else tokenizer.tokenize(sample)
        token_count += len(pieces)

    return char_count / token_count


In [46]:
class Dict2Obj(object):
  """Expose the entries of a dictionary as attributes of an object."""

  def __init__(self, dictionary):
    # Copy every key/value pair onto the instance as an attribute.
    for name, value in dictionary.items():
      setattr(self, name, value)

  def __repr__(self):
    # Render the instance attributes inside a "<dict2obj: ...>" wrapper.
    return "<dict2obj: %s>" % (vars(self),)
  
# Eval Params
# Plain settings container; wrapped in Dict2Obj below so values are read as
# attributes (eval_args.<name>).
eval_dict = dict(
    # -- checkpoint, data columns and sequence sizing --
    model_path=CHECKPOINT,
    subset="data/finetune",
    streaming=True,
    seq_length=8000,
    max_steps=1000,
    batch_size=1,
    input_column_name="request",
    output_column_name="response",
    # -- optimisation schedule --
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    num_warmup_steps=10,
    weight_decay=0.05,
    output_dir="./checkpoints_vx",
    # -- runtime, LoRA and precision switches --
    local_rank=0,
    # NOTE(review): the model printed earlier has an embedding table with 49152
    # rows, so id 49152 would be out of range if used as a token id - confirm.
    eos_token_id=49152,
    no_gradient_checkpointing=False,
    shuffle_buffer=5000,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    no_fp16=False,
    bf16=False,
    seed=0,
    num_workers=32,
    # -- logging / evaluation / checkpoint cadence (in steps) --
    log_freq=1,
    eval_freq=20,
    save_freq=20,
)

eval_args = Dict2Obj(eval_dict)
eval_args

<dict2obj: {'model_path': '/mnt/home/yucedago/Documents/life2scenario_core/starcoder/bigcode/starcoderbase_3b_40ep', 'subset': 'data/finetune', 'streaming': True, 'seq_length': 8000, 'max_steps': 1000, 'batch_size': 1, 'input_column_name': 'request', 'output_column_name': 'response', 'gradient_accumulation_steps': 16, 'learning_rate': 0.0001, 'lr_scheduler_type': 'cosine', 'num_warmup_steps': 10, 'weight_decay': 0.05, 'output_dir': './checkpoints_vx', 'local_rank': 0, 'eos_token_id': 49152, 'no_gradient_checkpointing': False, 'shuffle_buffer': 5000, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'no_fp16': False, 'bf16': False, 'seed': 0, 'num_workers': 32, 'log_freq': 1, 'eval_freq': 20, 'save_freq': 20}>

In [47]:
# Evaluation uses the held-out "test" split.
hf_test_data = l2s_dataset["test"]

# Characters-per-token estimate for this split; it is passed to the dataset below.
chars_per_token = chars_token_ratio(
    hf_test_data,
    tokenizer,
    input_column_name=eval_args.input_column_name,
    output_column_name=eval_args.output_column_name,
)
print(f"chars_per_token: {chars_per_token}")

# Build the evaluation dataset from the test split with infinite=False.
valid_dataset = ConstantLengthDataset(
    tokenizer,
    hf_test_data,
    infinite=False,
    seq_length=eval_args.seq_length,
    chars_per_token=chars_per_token,
    input_column_name=eval_args.input_column_name,
    output_column_name=eval_args.output_column_name,
)

100%|██████████| 400/400 [00:11<00:00, 35.55it/s]

chars_per_token: 3.6613754326801393





In [48]:
print("Starting main loop")

# Trainer configuration; every tunable value is read from eval_args.
training_args = TrainingArguments(
    # -- run identity and reporting --
    run_name="StarCoder-3b-life2scenario-medium",
    report_to="wandb",
    output_dir=eval_args.output_dir,
    logging_steps=eval_args.log_freq,
    # -- evaluation / checkpoint cadence --
    evaluation_strategy="steps",
    eval_steps=eval_args.eval_freq,
    save_strategy="steps",
    save_steps=eval_args.save_freq,
    load_best_model_at_end=True,
    max_steps=eval_args.max_steps,
    # -- batching --
    per_device_train_batch_size=eval_args.batch_size,
    per_device_eval_batch_size=eval_args.batch_size,
    gradient_accumulation_steps=eval_args.gradient_accumulation_steps,
    dataloader_drop_last=True,
    # -- optimiser schedule --
    learning_rate=eval_args.learning_rate,
    lr_scheduler_type=eval_args.lr_scheduler_type,
    warmup_steps=eval_args.num_warmup_steps,
    weight_decay=eval_args.weight_decay,
    # -- memory / precision (the "no_*" flags are inverted here) --
    gradient_checkpointing=not eval_args.no_gradient_checkpointing,
    fp16=not eval_args.no_fp16,
    bf16=eval_args.bf16,
    ddp_find_unused_parameters=False,
)

# Only an evaluation dataset is supplied; no training split is attached.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=valid_dataset,
    train_dataset=None,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting main loop


In [49]:

print("Evaluating...")
metrics = trainer.evaluate()

# eval_dict defines no "max_eval_samples" entry, so plain attribute access on
# eval_args would raise AttributeError; treat a missing value as "no cap".
max_eval_samples = getattr(eval_args, "max_eval_samples", None)

# NOTE(review): valid_dataset is a ConstantLengthDataset from finetune; if that
# class does not implement __len__, len() raises TypeError - confirm. The
# sample count is then taken from the cap alone, or omitted if there is none.
try:
    available_samples = len(valid_dataset)
except TypeError:
    available_samples = None

if available_samples is None:
    eval_samples = max_eval_samples
elif max_eval_samples is None:
    eval_samples = available_samples
else:
    eval_samples = min(max_eval_samples, available_samples)
if eval_samples is not None:
    metrics["eval_samples"] = eval_samples

# Perplexity is exp(eval_loss); a loss too large for math.exp maps to infinity.
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")
metrics["perplexity"] = perplexity

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

Evaluating...


OutOfMemoryError: CUDA out of memory. Tried to allocate 344.00 MiB (GPU 0; 31.74 GiB total capacity; 28.90 GiB already allocated; 162.88 MiB free; 29.38 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF