In [1]:
import os
from os import path
import pandas as pd
import numpy as np
import glob

In [2]:
import torch
from finetune import create_datasets, ConstantLengthDataset, chars_token_ratio, run_training
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /mnt/home/yucedago/miniconda3/envs/llm/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /mnt/home/yucedago/miniconda3/envs/llm/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda116.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [3]:
from datasets import load_dataset
from datasets import Dataset

# Definitions

In [4]:
# Project directories, all anchored at the current user's home directory.
HOME = os.path.expanduser('~')
LIFE2SCENARIO_ROOT_PATH = path.join(HOME, "Documents/life2scenario/")
DATASET_ROOT_PATH = path.join(
    LIFE2SCENARIO_ROOT_PATH,
    "life2scenario_minimal/dataset/train/",
)

print(DATASET_ROOT_PATH)

/mnt/home/yucedago/Documents/life2scenario/life2scenario_minimal/dataset/train/


In [5]:
# Sub-directories located directly under the training dataset root.
PROMPTS_ROOT, REFERENCE_ROOT, TARGET_ROOT = (
    path.join(DATASET_ROOT_PATH, subdir)
    for subdir in ("prompts", "ref_scenarios", "target_scenarios")
)

In [6]:
# Directory under the project root that holds the prepared training CSV.
PREP_PICKLES_ROOT = path.join(LIFE2SCENARIO_ROOT_PATH, "prep_pickles")

## Load Train DataFrame

In [7]:
# Read the prepared training table and keep only the prompt/answer columns,
# in this exact order.
train_csv_path = path.join(PREP_PICKLES_ROOT, "train_dataset.csv")
train_final = pd.read_csv(train_csv_path).loc[:, ["request", "response"]]
train_final.head()

Unnamed: 0,request,response
0,would you add pedestrian close to hero?\n```\n...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
1,i would like to command you to add pedestrian ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
2,i would like to request you to add pedestrian ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
3,i would like to tell you to add pedestrian at ...,"Here is the result:\n```\n<?xml version=""1.0"" ..."
4,add pedestrian close to hero?\n```\n<?xml vers...,"Here is the result:\n```\n<?xml version=""1.0"" ..."


## Create Dataset

In [8]:
# Fixed seed for the shuffle performed by `train_test_split`; without it the
# 90/10 partition changes on every kernel restart, so evaluation numbers are
# not comparable between runs. The value matches the `seed` entry of the
# training parameters.
SPLIT_SEED = 0

life2scenario_dataset = Dataset.from_pandas(train_final)
l2s_dataset = life2scenario_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
l2s_dataset

DatasetDict({
    train: Dataset({
        features: ['request', 'response'],
        num_rows: 20072
    })
    test: Dataset({
        features: ['request', 'response'],
        num_rows: 2231
    })
})

In [16]:
class Dict2Obj(object):
    """Expose the entries of a dictionary as attributes of an object.

    Every key of the dictionary becomes an attribute holding the matching
    value, so ``Dict2Obj({"a": 1}).a`` evaluates to ``1``.
    """

    def __init__(self, dictionary):
        """Copy each key/value pair of ``dictionary`` onto the instance."""
        for name, value in dictionary.items():
            setattr(self, name, value)

    def __repr__(self):
        """Return ``<dict2obj: {...}>`` listing the instance attributes."""
        return f"<dict2obj: {self.__dict__}>"

# Training Params
# Key order is kept as-is so that the repr of `train_args` stays the same.
train_dict = {
    "model_path": "bigcode/starcoderbase-1b",
    "subset": "data/finetune",
    "streaming": True,
    "seq_length": 8000,
    "max_steps": 1000,
    "batch_size": 1,
    # Column names handed to chars_token_ratio and ConstantLengthDataset.
    "input_column_name": "request",
    "output_column_name": "response",
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-4,
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 100,
    "weight_decay": 0.05,
    "output_dir": "./checkpoints",

    "local_rank": 0,
    "eos_token_id": 49152,
    "no_gradient_checkpointing": False,
    "shuffle_buffer": 5000,
    # NOTE(review): the lora_* keys are presumably the adapter settings
    # consumed inside finetune.run_training; confirm against that module.
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "no_fp16": False,
    "bf16": False,
    "seed": 0,
    "num_workers": None,
    "log_freq": 100,
    "eval_freq": 100,
    "save_freq": 1000,
}

# Wrap the dict so each setting is reachable as an attribute
# (e.g. `train_args.seq_length`).
train_args = Dict2Obj(train_dict)
train_args

<dict2obj: {'model_path': 'bigcode/starcoderbase-1b', 'subset': 'data/finetune', 'streaming': True, 'seq_length': 8000, 'max_steps': 1000, 'batch_size': 1, 'input_column_name': 'request', 'output_column_name': 'response', 'gradient_accumulation_steps': 16, 'learning_rate': 0.0001, 'lr_scheduler_type': 'cosine', 'num_warmup_steps': 100, 'weight_decay': 0.05, 'output_dir': './checkpoints', 'local_rank': 0, 'eos_token_id': 49152, 'no_gradient_checkpointing': False, 'shuffle_buffer': 5000, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'no_fp16': False, 'bf16': False, 'seed': 0, 'num_workers': None, 'log_freq': 100, 'eval_freq': 100, 'save_freq': 1000}>

In [17]:
# Checkpoint name passed to both `from_pretrained` calls below.
checkpoint = "bigcode/starcoderbase-1b"
device = "cuda"  # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the weights as torch.float16 to save memory, then move the model
# onto `device`.
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16)
model = model.to(device)

In [18]:
# Show the parameter count reported by the loaded model as the cell output.
model.num_parameters()

1137207296

In [24]:
# Raw splits produced by `train_test_split`.
train_data, test_data = l2s_dataset["train"], l2s_dataset["test"]

chars_per_token = chars_token_ratio(
    train_data,
    tokenizer,
    train_args.input_column_name,
    train_args.output_column_name,
)
print(f"chars_per_token: {chars_per_token}")

# Keyword arguments shared by the training and validation wrappers.
_constant_length_kwargs = dict(
    seq_length=train_args.seq_length,
    chars_per_token=chars_per_token,
    input_column_name=train_args.input_column_name,
    output_column_name=train_args.output_column_name,
)

# The two wrappers differ only in the split they read and the `infinite` flag.
train_dataset = ConstantLengthDataset(
    tokenizer, train_data, infinite=True, **_constant_length_kwargs
)
valid_dataset = ConstantLengthDataset(
    tokenizer, test_data, infinite=False, **_constant_length_kwargs
)

100%|██████████| 400/400 [00:07<00:00, 52.69it/s]

chars_per_token: 3.673552732407561





In [22]:
# Train on the ConstantLengthDataset wrappers (`train_dataset` /
# `valid_dataset`) rather than the raw `train_data` / `test_data` splits.
# The raw splits only carry the untokenized "request"/"response" text
# columns, and the previous call that passed them ended in
# "IndexError: list index out of range".
# NOTE(review): this assumes finetune.run_training expects the wrapped
# datasets, as the construction of the wrappers suggests; confirm against
# finetune.py.
run_training(train_args, train_dataset, valid_dataset)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Loading the model


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


trainable params: 7176192 || all params: 1144383488 || trainable%: 0.6270793029827428
Starting main loop
Training...


IndexError: list index out of range