In [54]:
!pip install sagemaker py7zr --upgrade --quiet
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q -U datasets bitsandbytes einops wandb

In [55]:
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


sagemaker role arn: arn:aws:iam::468983024660:role/service-role/AmazonSageMaker-ExecutionRole-20230709T073380
sagemaker bucket: sagemaker-us-east-1-468983024660
sagemaker session region: us-east-1


In [3]:
from datasets import load_dataset

dataset = load_dataset("Dahoas/full-hh-rlhf",split=[f"train{x}" for x in ["[0%:50%]","[51%:80%]","[81%:100%]"]])

Downloading metadata:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/478 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /home/ec2-user/.cache/huggingface/datasets/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/123M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/112052 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12451 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/ec2-user/.cache/huggingface/datasets/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
dataset[0].save_to_disk("sft_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/56026 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/32495 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/21290 [00:00<?, ? examples/s]

In [5]:
# upload files to s3 bucket
s3_sft_input = sess.upload_data(path="./sft_dataset",key_prefix="sft_model")

In [6]:
mkdir scripts

In [56]:
%%writefile scripts/train.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd 
import numpy as np
import os
import argparse
import numpy as np
from transformers import AutoTokenizer , AutoModelForCausalLM , TrainingArguments ,BitsAndBytesConfig , default_data_collator , Trainer
from transformers import get_linear_schedule_with_warmup 
from tqdm import tqdm
from peft import LoraConfig , get_peft_model , TaskType , prepare_model_for_int8_training , PeftConfig, PeftModel
from datasets import load_from_disk , Dataset , load_metric
import gc 


def create_peft_config(model):

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=[
        "query_key_value"
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ] 
    )

    # prepare int-8 model for training
    model = prepare_model_for_int8_training(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model



if __name__=='__main__':
    
    parse = argparse.ArgumentParser()
    ## hyperparameters 
    parse.add_argument("--epochs",default=1)
    parse.add_argument("--train_batch_size",default=2)
    parse.add_argument("--test_batch_size",default=2)
    parse.add_argument("--learning_rate",default=2e-4)
    parse.add_argument("--warmup_steps",default=500)
    parse.add_argument("--model_id",default="tiiuae/falcon-7b")
    ## sagemaker values 
    parse.add_argument("--output_data_dir",default=os.environ["SM_OUTPUT_DATA_DIR"])
    parse.add_argument("--model_dir",default=os.environ["SM_MODEL_DIR"])
    parse.add_argument("--n_gpus",default=os.environ["SM_NUM_GPUS"])
    parse.add_argument("--training_dir",default=os.environ["SM_CHANNEL_TRAIN"])
    
    args,_ = parse.parse_known_args()
    
    dataset = load_from_disk(args.training_dir)
    
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    model = AutoModelForCausalLM.from_pretrained(args.model_id,device_map="auto",load_in_8bit=True,trust_remote_code=True)
    
    model=create_peft_config(model)
    
    model.config.use_cache=False
    
    
    training_args = TrainingArguments(
        output_dir=f"{args.output_data_dir}",
        overwrite_output_dir=True,
        num_train_epochs=int(args.epochs), 
        logging_strategy="steps",
        logging_steps=50,
        warmup_steps =500,
        gradient_accumulation_steps=3,
        per_device_train_batch_size=int(args.train_batch_size),
        per_device_eval_batch_size=int(args.test_batch_size),
        save_strategy="no",
        bf16=True if torch.cuda.get_device_capability()[0] == 8 else False,
        logging_dir=f"{args.output_data_dir}/logs",
        learning_rate=float(args.learning_rate)
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=default_data_collator,
    )
       
    trainer.train()
    
    trainer.model.save_pretrained(args.model_dir) # save only adapter weights 
        

Overwriting scripts/train.py


In [16]:
from random import randint
from itertools import chain
from functools import partial

# custom instruct prompt start
prompt_template = f"You are an AI assitant to resolve Human queries query:\n{{prompt}}\n---\n response:\n{{response}}{{eos_token}}"


def template_dataset(sample):
    sample["text"] = prompt_template.format(prompt=sample["prompt"],
                                            response=sample["response"],
                                            eos_token=tokenizer.eos_token)
    return sample


# apply prompt template per sample
dataset = output.map(template_dataset, remove_columns=list(output.features))


remainder = {"input_ids": [], "attention_mask": []}


Map:   0%|          | 0/56026 [00:00<?, ? examples/s]

In [17]:
# chuks dataset into 2048 lengths 
def chunk(sample, chunk_length=2048):
    # define global remainder variable to save remainder from batches to use in next batch
    global remainder
    # Concatenate all texts and add remainder from previous batch
    concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
    
    concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # get max number of chunks for batch
    if batch_total_length >= chunk_length:
        batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder to global variable for next batch
    remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result


# tokenize and chunk dataset
lm_dataset = dataset.map(
    lambda sample: tokenizer(sample["text"],return_token_type_ids=False),batched=True, remove_columns=list(dataset.features)
).map(
    partial(chunk, chunk_length=2048),
    batched=True,
)

# Print total number of samples
print(f"Total number of samples: {len(lm_dataset)}")

Map:   0%|          | 0/56026 [00:00<?, ? examples/s]

Map:   0%|          | 0/56026 [00:00<?, ? examples/s]

Total number of samples: 7327


In [50]:
training_input_path = f's3://{sess.default_bucket()}/falcon/train'
lm_dataset.save_to_disk(training_input_path)

In [57]:
%%writefile scripts/requirements.txt
transformers==4.28.0
accelerate==0.20.3
bitsandbytes==0.39.1
einops==0.6.1
peft==0.3.0
datasets==2.13.1
tensorboardX==2.6.1
torch==2.0.0

Overwriting scripts/requirements.txt


In [58]:
from sagemaker.pytorch import PyTorch
from sagemaker.huggingface import HuggingFace

import time 

job_name = f'Falcon-peft-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'


estimator = HuggingFace(
    entry_point="train.py",
    source_dir="./scripts",
    base_job_name=job_name,
    instance_type="ml.g5.2xlarge",
    instance_count=1,
    role=role,
    transformers_version="4.28",
    pytorch_version="2.0",
    py_version="py310",
    sagemaker_session=sess
)

In [None]:
estimator.fit({"train":training_input_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: Falcon-peft-2023-07-12-02-27-46-2023-07-12-02-27-47-429


Using provided s3_resource
2023-07-12 02:27:47 Starting - Starting the training job...
2023-07-12 02:28:02 Starting - Preparing the instances for training......
2023-07-12 02:29:08 Downloading - Downloading input data...
2023-07-12 02:29:38 Training - Downloading the training image..............................
2023-07-12 02:34:24 Training - Training image download completed. Training in progress...[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-07-12 02:34:59,371 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-07-12 02:34:59,383 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-07-12 02:34:59,391 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-07-12 02:34:59,393 sagemaker_pytorch_container.training INFO     Invoking user training sc

In [62]:
! aws s3 cp s3://sagemaker-us-east-1-468983024660/Falcon-peft-2023-07-12-02-27-46-2023-07-12-02-27-47-429/output/model.tar.gz     ./

download: s3://sagemaker-us-east-1-468983024660/Falcon-peft-2023-07-12-02-27-46-2023-07-12-02-27-47-429/output/model.tar.gz to ./model.tar.gz


In [12]:
from transformers import AutoTokenizer , AutoModelForCausalLM

In [63]:
import tarfile 
with tarfile.open("./model.tar.gz","r") as f:
    f.extractall("./model")


In [105]:
%%writefile model/code/inference.py

# lets deploy the falcon 7B model for inference 
import torch
import torch.nn as nn
from transformers import AutoTokenizer , AutoModelForCausalLM
import os 
import json
from peft import LoraConfig , get_peft_model , TaskType , prepare_model_for_int8_training , PeftConfig, PeftModel


def model_fn(model_dir):
    
    peft_config = PeftConfig.from_pretrained(model_dir)
    
    model = AutoModelForCausalLM.from_pretrained(
        peft_config.base_model_name_or_path,
        return_dict=True,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True
    )
    
    model = PeftModel.from_pretrained(model, model_dir)
    
    model = model.merge_and_unload() # merge the falcon and trained adapter weights 
    
    tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
    
    tokenizer.pad_token = tokenizer.eos_token
    
    return model , tokenizer 
        

def predict_fn(data, model):
    
    model , tokenizer = model 
    
    model.eval()
    
    inputs = data.pop("inputs", data)
    
    parameters = data.pop("parameters", None)
    
    input_ids = tokenizer(inputs,return_tensors='pt',return_token_type_ids=False)
    
    model.to("cuda")
    
    input_ids.to("cuda")
    
    output = model.generate(**input_ids,**parameters)
    
    generated_text=tokenizer.decode(output[0],skip_special_tokens=True)
    
    return {"generated_text":generated_text[len(inputs):]}
  

Overwriting model/code/inference.py


In [106]:
%%writefile model/code/requirements.txt
transformers==4.30
einops==0.6.1
accelerate==0.20.3
bitsandbytes==0.39.1
peft==0.3.0
datasets==2.13.1

Overwriting model/code/requirements.txt


In [107]:
import tarfile
import os

# helper to create the model.tar.gz
def compress(tar_dir=None,output_file="model.tar.gz"):
    parent_dir=os.getcwd()
    os.chdir(tar_dir)
    with tarfile.open(os.path.join(parent_dir, output_file), "w:gz") as tar:
        for item in os.listdir('.'):
          print(item)
          tar.add(item, arcname=item)
    os.chdir(parent_dir)

compress(str('./model')) 

adapter_model.bin
code
adapter_config.json
.ipynb_checkpoints


In [108]:
s3_input = sess.upload_data(path='./model.tar.gz', bucket=sagemaker_session_bucket, key_prefix='falconinference') # upload the tar file to s3 bucke

In [109]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")

In [110]:
from sagemaker.huggingface import HuggingFaceModel

# created a torch model 
model=HuggingFaceModel(
model_data=s3_input,
py_version='py310',
sagemaker_session=sess,
transformers_version='4.28',
pytorch_version='2.0',
role=role
)

In [111]:
predictor=model.deploy(initial_instance_count=1,instance_type='ml.g5.8xlarge')

INFO:sagemaker:Creating model with name: huggingface-pytorch-inference-2023-07-12-11-55-13-601
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-inference-2023-07-12-11-55-14-268
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-inference-2023-07-12-11-55-14-268


-------------!

In [430]:
prompt=""""You are an AI assitant to help humans with queries 

{} \n---\n response:\n
        
""".format(" Human: Check to see what appointments I have today Assistant: Current schedule: 9:30 a.m. massage with James Dunlap; 11:00 a.m. dentist appointment with Dr. Denesee; 12:30 p.m. lunch; 3:30 p.m. meeting with the CEO of World Trade Group; 6:00 p.m. dinner with colleagues Human: can you call and cancel my 9:30 appointment Assistant:")

In [431]:
payload = {
  "inputs": prompt,
  "parameters": {
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.3,
    "top_k":150,
    "max_new_tokens": 1024,
    "repetition_penalty": 1.2,
     "eos_token_id":tokenizer.eos_token_id
  }
}

In [432]:
predictor.predict(payload)

{'generated_text': 'I’m sorry, but it is not possible for me to make phone calls on your behalf.'}