In [2]:
# Restart the kernel so that newly installed packages take effect.
# NOTE(review): this relies on the front end exposing a `Jupyter` JavaScript object - confirm it works in the UI you use.
from IPython.core.display import HTML

restart_script = "<script>Jupyter.notebook.kernel.restart()</script>"
HTML(restart_script)

In [None]:
import json
import os
import random
import warnings

# Silence warnings before the third-party imports below can emit any.
warnings.filterwarnings('ignore')

import boto3
import jsonlines
from datasets import load_dataset
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the AWS credentials/region that boto3 uses later - confirm.
load_dotenv()


True

In [7]:
# Download (or load from the local cache) the CNN/DailyMail dataset, configuration "3.0.0".
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

Downloading readme: 100%|██████████| 15.6k/15.6k [00:00<00:00, 15.6MB/s]
Downloading data: 100%|██████████| 257M/257M [00:22<00:00, 11.4MB/s]
Downloading data: 100%|██████████| 257M/257M [00:22<00:00, 11.4MB/s]
Downloading data: 100%|██████████| 259M/259M [00:22<00:00, 11.5MB/s]
Downloading data: 100%|██████████| 34.7M/34.7M [00:03<00:00, 10.3MB/s]
Downloading data: 100%|██████████| 30.0M/30.0M [00:03<00:00, 9.07MB/s]
Downloading data files: 100%|██████████| 3/3 [01:14<00:00, 24.75s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 76.65it/s]
Generating train split: 100%|██████████| 287113/287113 [00:01<00:00, 158826.50 examples/s]
Generating validation split: 100%|██████████| 13368/13368 [00:00<00:00, 164956.26 examples/s]
Generating test split: 100%|██████████| 11490/11490 [00:00<00:00, 154847.45 examples/s]


In [8]:
# Show the available splits with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [9]:
# Prompt preamble that is prepended to every article when the prompt/completion records are built.
# It ends with the "input:" header followed by a blank line, so the article text comes right after it.
instruction='''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

instruction:

Summarize the news article provided below.

input:

'''

In [10]:
# Turn every training example into a prompt/completion record:
# the prompt is the instruction preamble plus the article, the completion is the reference highlights.
datapoints_train = [
    {
        'prompt': instruction + example['article'],
        'completion': 'response:\n\n' + example['highlights'],
    }
    for example in dataset['train']
]

In [11]:
# Spot-check one generated prompt (the fifth training record) to verify the template.
print(datapoints_train[4]['prompt'])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

instruction:

Summarize the news article provided below.

input:

(CNN)  -- The National Football League has indefinitely suspended Atlanta Falcons quarterback Michael Vick without pay, officials with the league said Friday. NFL star Michael Vick is set to appear in court Monday. A judge will have the final say on a plea deal. Earlier, Vick admitted to participating in a dogfighting ring as part of a plea agreement with federal prosecutors in Virginia. "Your admitted conduct was not only illegal, but also cruel and reprehensible. Your team, the NFL, and NFL fans have all been hurt by your actions," NFL Commissioner Roger Goodell said in a letter to Vick. Goodell said he would review the status of the suspension after the legal proceedings are over. In papers filed Friday with a federal court in Virginia, Vick also admitted that h

In [12]:
def _to_datapoints(split):
    """Convert one dataset split into a list of prompt/completion records.

    Args:
        split: Iterable of examples, each exposing ``'article'`` and ``'highlights'`` strings.

    Returns:
        A list of dicts with a ``'prompt'`` (instruction preamble + article) and a
        ``'completion'`` (``'response:\\n\\n'`` + highlights) entry, in split order.
    """
    return [
        {
            'prompt': instruction + example['article'],
            'completion': 'response:\n\n' + example['highlights'],
        }
        for example in split
    ]


# Same record format as the training data, applied to the two held-out splits.
datapoints_valid = _to_datapoints(dataset['validation'])
datapoints_test = _to_datapoints(dataset['test'])

In [13]:
def dp_transform(data_points,num_dps,max_dp_length,seed=None):
    """Filter records by combined length, shuffle them, and keep at most ``num_dps``.

    Args:
        data_points: Iterable of dicts with string ``'prompt'`` and ``'completion'`` values.
        num_dps: Maximum number of records to return; must be non-negative.
        max_dp_length: Inclusive upper bound on ``len(prompt) + len(completion)``,
            measured in characters.
        seed: Optional seed for a private random generator, making the selection
            reproducible. When ``None`` (the default) the module-level ``random``
            state is used, exactly as before.

    Returns:
        A new list with at most ``num_dps`` of the records that fit the length
        limit, in shuffled order. The input collection is not modified.

    Raises:
        ValueError: If ``num_dps`` is negative (a negative slice bound would
            otherwise silently drop records from the end of the shuffled list).
    """
    if num_dps < 0:
        raise ValueError(f"num_dps must be non-negative, got {num_dps}")

    # Add the two lengths instead of concatenating the strings just to measure them.
    lines = [
        dp for dp in data_points
        if len(dp['prompt']) + len(dp['completion']) <= max_dp_length
    ]

    # A dedicated generator keeps a seeded call from disturbing the global random state.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(lines)
    return lines[:num_dps]

In [14]:
def jsonl_converter(dataset,file_name):
    """Print the destination path, then write every record of ``dataset`` to it as JSON Lines.

    Args:
        dataset: Iterable of JSON-serialisable records; one line is written per record.
        file_name: Path of the output file, opened in write mode.
    """
    print(file_name)
    with jsonlines.open(file_name, mode='w') as jsonl_writer:
        for record in dataset:
            jsonl_writer.write(record)

In [15]:
# dp_transform shuffles with the module-level random generator, so fix the seed here
# to get the same subsets on every run.
SAMPLING_SEED = 42
random.seed(SAMPLING_SEED)

# Keep records whose prompt + completion is at most 3000 characters, then draw
# 5000 / 999 / 10 of them for the train / validation / test files.
train = dp_transform(datapoints_train, 5000, 3000)
validation = dp_transform(datapoints_valid, 999, 3000)
test = dp_transform(datapoints_test, 10, 3000)

In [16]:
# Local output folder and the file names of the three JSONL files.
dataset_folder = "fine-tuning-datasets"
train_file_name = "train-cnn-5K.jsonl"
validation_file_name = "validation-cnn-1K.jsonl"
test_file_name = "test-cnn-10.jsonl"

# Create the folder from Python rather than `!mkdir`: this works on any OS and
# does not fail when the cell is re-run and the folder already exists.
os.makedirs(dataset_folder, exist_ok=True)
abs_path = os.path.abspath(dataset_folder)

In [17]:
# Write each sampled split to its JSONL file inside the local dataset folder.
for split_records, split_file_name in (
    (train, train_file_name),
    (validation, validation_file_name),
    (test, test_file_name),
):
    jsonl_converter(split_records, f'{abs_path}/{split_file_name}')

c:\repos\potential-meme\cookbook-aws\fine-tuning-datasets/train-cnn-5K.jsonl
c:\repos\potential-meme\cookbook-aws\fine-tuning-datasets/validation-cnn-1K.jsonl
c:\repos\potential-meme\cookbook-aws\fine-tuning-datasets/test-cnn-10.jsonl


In [27]:
# AWS clients: S3 for the dataset bucket, STS to look up the caller's account id.
s3_client = boto3.client("s3")
sts_client = boto3.client("sts")

region = s3_client.meta.region_name
account_id = sts_client.get_caller_identity()["Account"]

# The bucket name embeds region and account id.
s3_suffix = f"{region}-{account_id}"
bucket_name = f"bedrock-customization-{s3_suffix}"

In [30]:
# S3 requires a LocationConstraint for every region except us-east-1, where the
# parameter must be left out, so build the request according to the client's region.
create_bucket_kwargs = {"Bucket": bucket_name}
if region and region != "us-east-1":
    create_bucket_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": region}

s3bucket = s3_client.create_bucket(**create_bucket_kwargs)

In [31]:
# Upload each local JSONL file to fine-tuning-datasets/<split>/<file name> in the bucket.
for split_name, split_file_name in (
    ("train", train_file_name),
    ("validation", validation_file_name),
    ("test", test_file_name),
):
    s3_client.upload_file(
        f'{abs_path}/{split_file_name}',
        bucket_name,
        f'fine-tuning-datasets/{split_name}/{split_file_name}',
    )

In [34]:
# S3 URIs of the uploaded files; they mirror the object keys used for the upload.
s3_dataset_root = f's3://{bucket_name}/fine-tuning-datasets'
s3_train_uri = f'{s3_dataset_root}/train/{train_file_name}'
s3_validation_uri = f'{s3_dataset_root}/validation/{validation_file_name}'
s3_test_uri = f'{s3_dataset_root}/test/{test_file_name}'

In [None]:
# # install the fmeval package for foundation model evaluation
!rm -Rf ~/.cache/pip/*
!pip install tokenizers==0.12.1
!pip install -qU fmeval==0.3.0

In [35]:
import pprint

# Echo the three dataset URIs and the bucket name for a quick visual check.
for shown_value in (s3_train_uri, s3_validation_uri, s3_test_uri, bucket_name):
    pprint.pp(shown_value)

's3://bedrock-customization-us-east-1-975050265322/fine-tuning-datasets/train/train-cnn-5K.jsonl'
's3://bedrock-customization-us-east-1-975050265322/fine-tuning-datasets/validation/validation-cnn-1K.jsonl'
's3://bedrock-customization-us-east-1-975050265322/fine-tuning-datasets/test/test-cnn-10.jsonl'
'bedrock-customization-us-east-1-975050265322'


In [36]:
# Folder and file name of the 10-record test set.
data_folder = 'fine-tuning-datasets'
test_file_name = 'test-cnn-10.jsonl'

In [40]:
# Bedrock client (used for the model customization job) in the same region as the S3 client.
bedrock = boto3.client("bedrock", region_name=s3_client.meta.region_name)

In [41]:
# Display the region the Bedrock client is bound to.
bedrock.meta.region_name

'us-east-1'

In [46]:
# Trust policy: lets the Bedrock service assume the role, restricted to model
# customization jobs of this account in this region. Built with json.dumps so the
# document is always valid JSON (no hand-escaped braces).
ROLE_DOC = json.dumps(
    {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {"Service": "bedrock.amazonaws.com"},
                "Action": "sts:AssumeRole",
                "Condition": {
                    "StringEquals": {"aws:SourceAccount": account_id},
                    "ArnEquals": {
                        "aws:SourceArn": f"arn:aws:bedrock:{region}:{account_id}:model-customization-job/*"
                    },
                },
            }
        ],
    },
    indent=4,
)

# Permissions policy: S3 access limited to the dataset bucket and the objects in it.
ACCESS_POLICY_DOC = json.dumps(
    {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": [
                    "s3:AbortMultipartUpload",
                    "s3:DeleteObject",
                    "s3:PutObject",
                    "s3:GetObject",
                    "s3:GetBucketAcl",
                    "s3:GetBucketNotification",
                    "s3:ListBucket",
                    "s3:PutBucketNotification",
                ],
                "Resource": [
                    f"arn:aws:s3:::{bucket_name}",
                    f"arn:aws:s3:::{bucket_name}/*",
                ],
            }
        ],
    },
    indent=4,
)

iam = boto3.client('iam', region_name=region)
role_name = "AmazonBedrockCustomizationRole1"
s3_bedrock_finetuning_access_policy = "AmazonBedrockCustomizationPolicy1"
customization_role = f"arn:aws:iam::{account_id}:role/{role_name}"

try:
    response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=ROLE_DOC,
        Description="Role for Bedrock to access S3 for finetuning",
    )
except iam.exceptions.EntityAlreadyExistsException:
    # The cell was run before: reuse the existing role instead of failing.
    # Its trust policy is NOT updated here, so verify it if ROLE_DOC has changed.
    response = iam.get_role(RoleName=role_name)
pprint.pp(response)


{'Role': {'Path': '/',
          'RoleName': 'AmazonBedrockCustomizationRole1',
          'RoleId': 'AROA6GBMGKLVOP5IHPRVA',
          'Arn': 'arn:aws:iam::975050265322:role/AmazonBedrockCustomizationRole1',
          'CreateDate': datetime.datetime(2025, 5, 19, 20, 32, 38, tzinfo=tzutc()),
          'AssumeRolePolicyDocument': {'Version': '2012-10-17',
                                       'Statement': [{'Effect': 'Allow',
                                                      'Principal': {'Service': 'bedrock.amazonaws.com'},
                                                      'Action': 'sts:AssumeRole',
                                                      'Condition': {'StringEquals': {'aws:SourceAccount': '975050265322'},
                                                                    'ArnEquals': {'aws:SourceArn': 'arn:aws:bedrock:us-east-1:975050265322:model-customization-job/*'}}}]}},
 'ResponseMetadata': {'RequestId': 'a6162493-4010-48e9-9a71-a0bca72c3cc8',
             

In [47]:
# Read the role ARN from the IAM response that describes the role.
# `response` is reassigned by the later create_policy cell, so this cell must be run
# right after the role cell - afterwards `response` no longer has a "Role" key.
role_arn = response["Role"]["Arn"]
pprint.pp(role_arn)

'arn:aws:iam::975050265322:role/AmazonBedrockCustomizationRole1'


In [51]:
# Create the managed policy that grants access to the dataset bucket.
try:
    response = iam.create_policy(
        PolicyName=s3_bedrock_finetuning_access_policy,
        PolicyDocument=ACCESS_POLICY_DOC,
    )
except iam.exceptions.EntityAlreadyExistsException:
    # The cell was run before: look the existing policy up instead of failing.
    # Its document is NOT updated here, so verify it if ACCESS_POLICY_DOC has changed.
    response = iam.get_policy(
        PolicyArn=f"arn:aws:iam::{account_id}:policy/{s3_bedrock_finetuning_access_policy}"
    )
pprint.pp(response)

{'Policy': {'PolicyName': 'AmazonBedrockCustomizationPolicy1',
            'PolicyId': 'ANPA6GBMGKLVIUDUSCQF7',
            'Arn': 'arn:aws:iam::975050265322:policy/AmazonBedrockCustomizationPolicy1',
            'Path': '/',
            'DefaultVersionId': 'v1',
            'AttachmentCount': 0,
            'PermissionsBoundaryUsageCount': 0,
            'IsAttachable': True,
            'CreateDate': datetime.datetime(2025, 5, 19, 20, 46, 21, tzinfo=tzutc()),
            'UpdateDate': datetime.datetime(2025, 5, 19, 20, 46, 21, tzinfo=tzutc())},
 'ResponseMetadata': {'RequestId': 'deaf5e04-11f2-4bec-a1ff-4e6bfc746c31',
                      'HTTPStatusCode': 200,
                      'HTTPHeaders': {'date': 'Mon, 19 May 2025 20:46:21 GMT',
                                      'x-amzn-requestid': 'deaf5e04-11f2-4bec-a1ff-4e6bfc746c31',
                                      'content-type': 'text/xml',
                                      'content-length': '801'},
                    

In [52]:
# Take the policy ARN from the IAM response and attach that policy to the customization role.
policy_arn = response["Policy"]["Arn"]
pprint.pp(policy_arn)

iam.attach_role_policy(RoleName=role_name, PolicyArn=policy_arn)

'arn:aws:iam::975050265322:policy/AmazonBedrockCustomizationPolicy1'


{'ResponseMetadata': {'RequestId': '70d86ae1-496b-4e01-8a60-7856e65f5e77',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Mon, 19 May 2025 20:46:39 GMT',
   'x-amzn-requestid': '70d86ae1-496b-4e01-8a60-7856e65f5e77',
   'content-type': 'text/xml',
   'content-length': '212'},
  'RetryAttempts': 0}}

In [None]:
from datetime import datetime

# A timestamp suffix keeps job and model names unique across runs.
ts = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Foundation model to customize (model reference:
# https://docs.aws.amazon.com/bedrock/latest/userguide/bedrock-reference.html).
# The ARN uses the same region as the Bedrock client and the IAM trust policy
# instead of a hard-coded one.
base_model_id = f"arn:aws:bedrock:{region}::foundation-model/cohere.command-light-text-v14:7:4k"

# Customization type: "FINE_TUNING" or "CONTINUED_PRE_TRAINING".
customization_type = "FINE_TUNING"

# The job runs under `customization_role`, the role ARN that was built from
# account_id and role_name when the role was created.

# Name of the customization job
customization_job_name = f"cohere-light-finetune-sm-test-model-{ts}"

# Name of the resulting fine-tuned Cohere Command Light model
custom_model_name = f"cohere-light-finetune-{ts}"

# Hyperparameters for fine-tuning; the values are passed as strings.
hyper_parameters = {
    "epochCount": "1",
    "batchSize": "8",
    "learningRate": "0.00005",
}

# Data locations for training, validation (optional) and output
training_data_config = {"s3Uri": s3_train_uri}

validation_data_config = {
    "validators": [
        {"s3Uri": s3_validation_uri},
    ]
}

output_data_config = {"s3Uri": f's3://{bucket_name}/outputs/output-{custom_model_name}'}

# Create the customization job and keep the response so the job ARN stays available.
customization_job_response = bedrock.create_model_customization_job(
    customizationType=customization_type,
    jobName=customization_job_name,
    customModelName=custom_model_name,
    roleArn=customization_role,
    baseModelIdentifier=base_model_id,
    hyperParameters=hyper_parameters,
    trainingDataConfig=training_data_config,
    validationDataConfig=validation_data_config,
    outputDataConfig=output_data_config,
)
customization_job_response

{'ResponseMetadata': {'RequestId': 'aab19a4b-8bb0-4504-b61d-25af5622f74c',
  'HTTPStatusCode': 201,
  'HTTPHeaders': {'date': 'Mon, 19 May 2025 21:20:02 GMT',
   'content-type': 'application/json',
   'content-length': '123',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'aab19a4b-8bb0-4504-b61d-25af5622f74c'},
  'RetryAttempts': 0},
 'jobArn': 'arn:aws:bedrock:us-east-1:975050265322:model-customization-job/cohere.command-light-text-v14:7:4k/o7wdsiguuhnj'}

In [71]:
# Look the job up by name and print its current status.
job_details = bedrock.get_model_customization_job(jobIdentifier=customization_job_name)
fine_tune_job = job_details["status"]
print(fine_tune_job)


InProgress
