# Dolly v2 SageMaker Finetuning

This notebook contains sample code to fine-tune and deploy Dolly v2 with LoRA on SageMaker.

In [None]:
!pip install -U "sagemaker>=2.143.0"

In [None]:
import sagemaker, boto3, json
from sagemaker import get_execution_role
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.huggingface import HuggingFace

# SageMaker session and its default S3 bucket (used for the uploads below).
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Execution role of this notebook and the region of the default boto3 session.
role = get_execution_role()
region = boto3.Session().region_name

# Show the installed SageMaker SDK version.
sagemaker.__version__

## Upload Data

We will use databricks-dolly-15k as the sample dataset to fine-tune the model. (License: [Creative Commons Attribution-ShareAlike 3.0 Unported License](https://creativecommons.org/licenses/by-sa/3.0/legalcode))

You may also use a custom dataset.

In [None]:
# Optional: uncomment to download the dataset to data/databricks-dolly-15k.jsonl
# (--create-dirs also creates the data/ directory).
# NOTE(review): the loading cell below reads
# ../dataset_dolly2.0/databricks-dolly-15k.jsonl instead; confirm which location is intended.
#!curl https://raw.githubusercontent.com/databrickslabs/dolly/master/data/databricks-dolly-15k.jsonl --create-dirs -o data/databricks-dolly-15k.jsonl

In [None]:
# Convert .jsonl to .json
import pandas as pd

In [None]:
# Load the JSON Lines dataset (one record per line) into a DataFrame.
df = pd.read_json(
    "../dataset_dolly2.0/databricks-dolly-15k.jsonl",
    lines=True,
    orient="records",
)

In [None]:
# Preview the loaded DataFrame.
df

In [None]:
from pathlib import Path

# Rename the dataset columns to "input"/"output" (presumably the field names
# the fine-tuning script expects; confirm against scripts/code).
df = df.rename(columns={"context": "input", "response": "output"})

# Make sure data/ exists before writing: the download cell that would have
# created it is commented out, so on a fresh checkout to_json would fail.
output_json = Path("data/databricks-dolly-15k.json")
output_json.parent.mkdir(parents=True, exist_ok=True)

# Write all records as a single JSON array.
df.to_json(output_json, orient='records')

In [None]:
# Preview the DataFrame after the column rename.
df

In [None]:
# Upload the converted JSON file under the "Dolly" key prefix of the session's
# default bucket, then display the value returned by upload_data.
input_train = sess.upload_data("./data/databricks-dolly-15k.json", key_prefix="Dolly")
input_train

## Japanese text

In [None]:
# Load the Japanese JSON Lines dataset (one record per line) into a DataFrame.
df_ja = pd.read_json(
    "../dataset_dolly2.0/output_all/databricks-dolly-15k-ja.jsonl",
    lines=True,
    orient="records",
)

In [None]:
# Preview the loaded Japanese DataFrame.
df_ja

In [None]:
from pathlib import Path

# Rename the columns to "input"/"output" and drop the unneeded "row" column.
# Done as one chain without inplace=True, and with errors="ignore", so that
# re-running this cell (when "row" is already gone) does not raise KeyError.
df_ja = (
    df_ja
    .rename(columns={"context": "input", "response": "output"})
    .drop(columns="row", errors="ignore")
)

# Make sure data/ exists before writing: the download cell that would have
# created it is commented out, so on a fresh checkout to_json would fail.
output_json_ja = Path("data/databricks-dolly-15k-ja.json")
output_json_ja.parent.mkdir(parents=True, exist_ok=True)

# Write all records as a single JSON array.
df_ja.to_json(output_json_ja, orient='records')

In [None]:
# Preview the Japanese DataFrame after the rename and the removal of "row".
df_ja

In [None]:
# Upload the converted Japanese JSON file under the "Dolly" key prefix of the
# session's default bucket, then display the value returned by upload_data.
input_train_ja = sess.upload_data("./data/databricks-dolly-15k-ja.json", key_prefix="Dolly")
input_train_ja

## Fine-tuning

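Fine-tuning took approximately 4 hours for 1 epoch on p3.2xlarge. Note that the configuration below uses `ml.p4d.24xlarge` and 3 epochs, so the duration will differ.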

In [None]:
# Hyperparameters handed to the training entry point.
# To train on the English dataset instead, set data_path to
# '/opt/ml/input/data/train/databricks-dolly-15k.json'.
hyperparameters = dict(
    base_model='databricks/dolly-v2-3b',
    load_in_8bit=False,
    data_path='/opt/ml/input/data/train/databricks-dolly-15k-ja.json',
    num_epochs=3,  # default 3
    cutoff_len=512,
    group_by_length=True,
    output_dir='/opt/ml/model',
    lora_target_modules='[query_key_value]',
    lora_r=16,
    batch_size=32,
    micro_batch_size=4,
    prompt_template_name='alpaca',
)

In [None]:
# Settings for the Hugging Face training job.
# Alternatives kept from the original notebook:
#   - instance_type='ml.g5.2xlarge'
#   - spot training: use_spot_instances=True, max_wait=86400
estimator_config = dict(
    base_job_name="Dolly-v2",
    role=role,
    entry_point='finetune.py',
    source_dir='./scripts/code',
    instance_type='ml.p4d.24xlarge',
    instance_count=1,
    volume_size=200,
    transformers_version='4.26',
    pytorch_version='1.13',
    py_version='py39',
    hyperparameters=hyperparameters,
)
huggingface_estimator = HuggingFace(**estimator_config)

# Start training on the Japanese dataset as the 'train' channel
# (pass input_train instead to use the English dataset).
training_channels = {'train': input_train_ja}
huggingface_estimator.fit(training_channels)

In [None]:
# Display the estimator's output_path attribute.
huggingface_estimator.output_path

In [None]:
# Display the location of the trained model artifact (downloaded with `aws s3 cp` below).
huggingface_estimator.model_data

## Download and Extract Model

In [None]:
!aws s3 cp $huggingface_estimator.model_data model.tar.gz

In [None]:
!rm -rf scripts/model && mkdir scripts/model
!tar -xvf model.tar.gz -C scripts/model --no-same-owner --wildcards adapter_*

## Package and Upload Model

In [None]:
%cd scripts
!tar -czvf ../package.tar.gz *
%cd -

In [None]:
# Upload the packaged model under the "Dolly-v2" key prefix and display the
# value returned by upload_data.
model_path = sess.upload_data(
    path='package.tar.gz',
    bucket=bucket,
    key_prefix="Dolly-v2",
)
model_path

## Deploy Model

In [None]:
from sagemaker.async_inference import AsyncInferenceConfig
from sagemaker.serializers import JSONSerializer

# The same name is used for the SageMaker model and for the endpoint.
endpoint_name = "Dolly-v2"

# Settings handed to the container as a JSON string in the `model_params`
# environment variable.
model_params = {
    "base_model": "databricks/dolly-v2-3b",
    "lora_weights": "model", # path relative to model package
    "peft": True,
    "load_8bit": True,
    "prompt_template": "alpaca",
}

huggingface_model = PyTorchModel(
    name=endpoint_name,
    role=role,
    model_data=model_path,
    framework_version="1.13",
    py_version='py39',
    env={"model_params": json.dumps(model_params)},
)

# Deploy the model to a SageMaker asynchronous inference endpoint.
predictor = huggingface_model.deploy(
    endpoint_name=endpoint_name,
    instance_type='ml.g5.2xlarge',
    initial_instance_count=1,
    serializer=JSONSerializer(),
    async_inference_config=AsyncInferenceConfig(),
)

## Run Inference

In [None]:
from sagemaker.predictor import Predictor
from sagemaker.predictor_async import AsyncPredictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import NumpyDeserializer

# Plain predictor bound to the existing endpoint: requests are serialized as
# JSON and responses are decoded with NumpyDeserializer.
base_predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=JSONSerializer(),
    deserializer=NumpyDeserializer(),
)

# Asynchronous wrapper used by the inference cells below.
predictor_client = AsyncPredictor(predictor=base_predictor, name=endpoint_name)

In [None]:
data = {
    "instruction": "When was George Washington president?",
    "input": """George Washington (February 22, 1732[b] – December 14, 1799) was an American military officer, statesman,
and Founding Father who served as the first president of the United States from 1789 to 1797.
""",
    "max_new_tokens": 64,
    "temperature": 0.7,
    "do_sample": True,
    "stop_ids": [50278, 50279, 50277, 1, 0],
}
response = predictor_client.predict(
    data=data
)
print(response)

In [None]:
data = {
    "instruction": "ジョージ・ワシントンが大統領になったのはいつですか？",
    "input": """ジョージ・ワシントン（George Washington、1732年2月22日[b] - 1799年12月14日）は、アメリカの軍人、政治家である、
と、1789年から1797年までアメリカ合衆国の初代大統領を務めた建国の父。
""",
    "max_new_tokens": 64,
    "temperature": 0.7,
    "do_sample": True,
    "stop_ids": [50278, 50279, 50277, 1, 0],
}
response = predictor_client.predict(
    data=data
)
print(response)

In [None]:
data = {
    "instruction": "",
    "input": """日本で一番高い山はなんですか？また、その高さは？
""",
    "max_new_tokens": 64,
    "temperature": 0.7,
    "do_sample": True,
    "stop_ids": [50278, 50279, 50277, 1, 0],
}
response = predictor_client.predict(
    data=data
)
print(response)

In [None]:
data = {
    "instruction": "以下の質問に、日本語で答えてください。",
    "input": """日本で一番高い山はなんですか？また、その高さは？
""",
    "max_new_tokens": 64,
    "temperature": 0.7,
    "do_sample": True,
    "stop_ids": [50278, 50279, 50277, 1, 0],
}
response = predictor_client.predict(
    data=data
)
print(response)

In [None]:
data = {
    "instruction": "",
    "input": """What is the highest mountain in Japan? How tall it is?
""",
    "max_new_tokens": 64,
    "temperature": 0.7,
    "do_sample": True,
    "stop_ids": [50278, 50279, 50277, 1, 0],
}
response = predictor_client.predict(
    data=data
)
print(response)

In [None]:
data = {
    "instruction": "以下の質問に、日本語で答えてください。",
    "input": """子供を寝かしつける最適な方法は？
""",
    "max_new_tokens": 64,
    "temperature": 0.7,
    "do_sample": True,
    "stop_ids": [50278, 50279, 50277, 1, 0],
}
response = predictor_client.predict(
    data=data
)
print(response)

In [None]:
data = {
    "instruction": "以下の質問に、日本語で答えてください。",
    "input": """pandas DataFrame に CSVファイルを読み込ませたい。
""",
    "max_new_tokens": 64,
    "temperature": 0.7,
    "do_sample": True,
    "stop_ids": [50278, 50279, 50277, 1, 0],
}
response = predictor_client.predict(
    data=data
)
print(response)

## Delete Endpoint

In [None]:
# Tear down what the deploy step created, using the predictor returned by
# huggingface_model.deploy(...): first the model, then the endpoint.
predictor.delete_model()
predictor.delete_endpoint()