In [1]:
!pip install datasets -q


In [2]:
from datasets import load_dataset

In [37]:
# Identifier of the dataset passed to `load_dataset`.
dataset_id = "cognitivecomputations/dolphin-coder"

# Load only the "train" split, using ./data as the cache directory.
dataset = load_dataset(
    dataset_id,
    split="train",
    cache_dir="./data",
)

In [27]:
# Partition the rows into a "train" subset (used for fine-tuning) and a
# "test" subset (held back for evaluation at the end). The fixed seed keeps
# the partition the same across runs.
# NOTE(review): test_size=0.9 appears to send 90% of the rows to "test" and
# leave only 10% for training - confirm this down-sampling is intentional.
train_and_test_dataset = dataset.train_test_split(
    seed=0,
    test_size=0.9,
)

In [30]:
# Write the training split to a local JSON Lines file for the training job.
train_split = train_and_test_dataset["train"]
train_split.to_json("train.jsonl")

# Keep only the first 10 rows of the test split for the local test file.
test_sample = train_and_test_dataset["test"].select(range(10))
test_sample.to_json("test.jsonl")

Creating json from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

31601

In [31]:
# Show the first training row to check its fields before building the prompt template.
train_and_test_dataset["train"][0]

{'system_prompt': 'a chat',
 'question': 'Please explain the following problem in detail and provide code in Haskell:\nYou are given a 2D integer array `ranges` and two integers `left` and `right`. Each `ranges[i] = [starti, endi]` represents an **inclusive** interval between `starti` and `endi`.\n\nReturn `true` _if each integer in the inclusive range_ `[left, right]` _is covered by **at least one** interval in_ `ranges`. Return `false` _otherwise_.\n\nAn integer `x` is covered by an interval `ranges[i] = [starti, endi]` if `starti <= x <= endi`.\n\n**Example 1:**\n\n**Input:** ranges = \\[\\[1,2\\],\\[3,4\\],\\[5,6\\]\\], left = 2, right = 5\n**Output:** true\n**Explanation:** Every integer between 2 and 5 is covered:\n- 2 is covered by the first range.\n- 3 and 4 are covered by the second range.\n- 5 is covered by the third range.\n\n**Example 2:**\n\n**Input:** ranges = \\[\\[1,10\\],\\[10,20\\]\\], left = 21, right = 21\n**Output:** false\n**Explanation:** 21 is not covered by any

In [32]:
import json

# Prompt/completion layout written to template.json. The names in braces are
# placeholders; `system_prompt` and `question` match the fields shown in the
# row preview (presumably `response` is a dataset column as well - verify).
template = dict(
    prompt="{system_prompt}\n\n### Input:\n{question}\n",
    completion=" {response}",
)

# Serialize the template next to the training data file.
with open("template.json", "w") as template_file:
    template_file.write(json.dumps(template))

### Upload dataset to S3


In [33]:
from sagemaker.s3 import S3Uploader
import sagemaker
import random

# Destination: a fixed prefix inside the SageMaker session's default bucket.
output_bucket = sagemaker.Session().default_bucket()
local_data_file = "train.jsonl"
train_data_location = f"s3://{output_bucket}/dolphin_coder_dataset"

# Upload the training data first, then the prompt template, to the same prefix.
for artifact in (local_data_file, "template.json"):
    S3Uploader.upload(artifact, train_data_location)

print(f"Training data: {train_data_location}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Training data: s3://sagemaker-us-east-1-434444145045/dolphin_coder_dataset


In [38]:
from ipywidgets import Dropdown
from sagemaker.jumpstart.notebook_utils import list_jumpstart_models


# Offer an interactive picker over the JumpStart text generation models,
# pre-selecting CodeLlama 7B. Building or rendering the widget is best-effort:
# on failure `dropdown` is set to None so later cells can fall back to a
# hard-coded model id.
try:
    dropdown = Dropdown(
        options=list_jumpstart_models("search_keywords includes Text Generation"),
        value="meta-textgeneration-llama-codellama-7b",
        description="Select a JumpStart text generation model:",
        style={"description_width": "initial"},
        layout={"width": "max-content"},
    )
    display(dropdown)
except Exception as exc:
    # Catch only ordinary errors (not KeyboardInterrupt/SystemExit) and report
    # the cause instead of silently hiding it.
    dropdown = None
    print(f"Model dropdown unavailable ({exc!r}); falling back to the default model id.")

Dropdown(description='Select a JumpStart text generation model:', index=111, layout=Layout(width='max-content'…

In [41]:
# Use the widget's current selection when the dropdown exists; otherwise fall
# back to the CodeLlama 7B model id.
model_id = dropdown.value if dropdown else "meta-textgeneration-llama-codellama-7b"

# "*" is a wildcard version identifier rather than a pinned model version.
model_version = "*"

In [42]:
# Echo the model id that the following cells will use.
model_id

'meta-textgeneration-llama-codellama-7b'

In [43]:
from sagemaker import hyperparameters

# Fetch the default fine-tuning hyperparameters for the chosen model/version
# and print them so they can be inspected before being overridden.
# NOTE(review): with the wildcard version the SDK log suggests pinning an
# explicit version (e.g. '2.2.1') for more stable results.
my_hyperparameters = hyperparameters.retrieve_default(model_id=model_id, model_version=model_version)
print(my_hyperparameters)

Using model 'meta-textgeneration-llama-codellama-7b' with wildcard version identifier '*'. You can pin to version '2.2.1' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


{'int8_quantization': 'False', 'enable_fsdp': 'True', 'epoch': '2', 'learning_rate': '0.0001', 'lora_r': '8', 'lora_alpha': '32', 'target_modules': 'q_proj,v_proj', 'lora_dropout': '0.05', 'instruction_tuned': 'True', 'chat_dataset': 'False', 'add_input_output_demarcation_key': 'True', 'per_device_train_batch_size': '2', 'per_device_eval_batch_size': '1', 'max_train_samples': '-1', 'max_val_samples': '-1', 'seed': '10', 'max_input_length': '2048', 'validation_split_ratio': '0.2', 'train_data_split_seed': '0', 'preprocessing_num_workers': 'None'}


In [44]:
# Override the default epoch count with a single epoch (values are strings).
my_hyperparameters.update(epoch="1")
print(my_hyperparameters)

# Check the overridden hyperparameters against the selected model and version.
hyperparameters.validate(
    hyperparameters=my_hyperparameters,
    model_id=model_id,
    model_version=model_version,
)

{'int8_quantization': 'False', 'enable_fsdp': 'True', 'epoch': '1', 'learning_rate': '0.0001', 'lora_r': '8', 'lora_alpha': '32', 'target_modules': 'q_proj,v_proj', 'lora_dropout': '0.05', 'instruction_tuned': 'True', 'chat_dataset': 'False', 'add_input_output_demarcation_key': 'True', 'per_device_train_batch_size': '2', 'per_device_eval_batch_size': '1', 'max_train_samples': '-1', 'max_val_samples': '-1', 'seed': '10', 'max_input_length': '2048', 'validation_split_ratio': '0.2', 'train_data_split_seed': '0', 'preprocessing_num_workers': 'None'}


In [45]:
from sagemaker.jumpstart.estimator import JumpStartEstimator

# Arguments for the JumpStart estimator. `accept_eula` is set to "true" here,
# i.e. this notebook accepts the model's end-user license agreement; change it
# if you do not accept the EULA.
estimator_kwargs = dict(
    model_id=model_id,
    model_version=model_version,
    hyperparameters=my_hyperparameters,
    environment={"accept_eula": "true"},
)
estimator = JumpStartEstimator(**estimator_kwargs)



In [46]:
# Launch the fine-tuning job, passing the uploaded S3 prefix as the "training" channel.
# NOTE: the captured run failed with ResourceLimitExceeded because the account's
# 'ml.g5.12xlarge for training job usage' quota was 0 - request a quota increase
# via AWS Service Quotas before running this cell.
estimator.fit({"training": train_data_location})

INFO:sagemaker:Creating training-job with name: meta-textgeneration-llama-codellama-7b-2024-03-31-02-42-04-907


ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.g5.12xlarge for training job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.