In [2]:
!pip install --upgrade pip
%pip install --no-build-isolation --force-reinstall \
    "boto3>=1.28.57" \
    "awscli>=1.29.57" \
    "botocore>=1.31.57"
!pip install -qU --force-reinstall langchain typing_extensions pypdf urllib3==2.1.0
!pip install -qU ipywidgets>=7,<8
!pip install jsonlines
!pip install datasets==2.15.0
!pip install pandas==2.1.3
!pip install matplotlib==3.8.2

[0mCollecting boto3>=1.28.57
  Using cached boto3-1.34.84-py3-none-any.whl.metadata (6.6 kB)
Collecting awscli>=1.29.57
  Using cached awscli-1.32.84-py3-none-any.whl.metadata (11 kB)
Collecting botocore>=1.31.57
  Using cached botocore-1.34.84-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.28.57)
  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3>=1.28.57)
  Using cached s3transfer-0.10.1-py3-none-any.whl.metadata (1.7 kB)
Collecting docutils<0.17,>=0.10 (from awscli>=1.29.57)
  Using cached docutils-0.16-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting PyYAML<6.1,>=3.10 (from awscli>=1.29.57)
  Using cached PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting colorama<0.4.5,>=0.2.5 (from awscli>=1.29.57)
  Using cached colorama-0.4.4-py2.py3-none-any.whl.metadata (14 kB)
Collecting rsa<4.8,>=3.1.2 (from awscli>=1.29.57)
  Using cached 

In [None]:
# restart kernel for packages to take effect
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [1]:
import warnings
warnings.filterwarnings('ignore')
import json
import os
import sys
import boto3 
import time
import pprint
from datasets import load_dataset
import random
import jsonlines

In [2]:
session = boto3.session.Session()
region = session.region_name
sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity()["Account"]
s3_suffix = f"{region}-{account_id}"
bucket_name = f"bedrock-customization-{s3_suffix}"
s3_client = boto3.client('s3')
bedrock = boto3.client(service_name="bedrock")
bedrock_runtime = boto3.client(service_name="bedrock-runtime")
iam = boto3.client('iam', region_name=region)

In [3]:
import uuid
suffix = str(uuid.uuid4())
role_name = "BedrockRole-" + suffix
s3_bedrock_finetuning_access_policy="BedrockPolicy-" + suffix
customization_role = f"arn:aws:iam::{account_id}:role/{role_name}"

In [4]:
for model in bedrock.list_foundation_models(
    byCustomizationType="FINE_TUNING")["modelSummaries"]:
    for key, value in model.items():
        print(key, ":", value)
    print("-----\n")

modelArn : arn:aws:bedrock:us-west-2::foundation-model/amazon.titan-text-lite-v1:0:4k
modelId : amazon.titan-text-lite-v1:0:4k
modelName : Titan Text G1 - Lite
providerName : Amazon
inputModalities : ['TEXT']
outputModalities : ['TEXT']
responseStreamingSupported : True
customizationsSupported : ['FINE_TUNING', 'CONTINUED_PRE_TRAINING']
inferenceTypesSupported : ['PROVISIONED']
modelLifecycle : {'status': 'ACTIVE'}
-----

modelArn : arn:aws:bedrock:us-west-2::foundation-model/amazon.titan-text-express-v1:0:8k
modelId : amazon.titan-text-express-v1:0:8k
modelName : Titan Text G1 - Express
providerName : Amazon
inputModalities : ['TEXT']
outputModalities : ['TEXT']
responseStreamingSupported : True
customizationsSupported : ['FINE_TUNING', 'CONTINUED_PRE_TRAINING']
inferenceTypesSupported : ['PROVISIONED']
modelLifecycle : {'status': 'ACTIVE'}
-----

modelArn : arn:aws:bedrock:us-west-2::foundation-model/amazon.titan-embed-image-v1:0
modelId : amazon.titan-embed-image-v1:0
modelName : Ti

In [5]:
# Create S3 bucket for knowledge base data source
s3bucket = s3_client.create_bucket(
    Bucket=bucket_name,
     CreateBucketConfiguration={
         'LocationConstraint':region,
     },
)

BucketAlreadyOwnedByYou: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.

In [6]:
ROLE_DOC = f"""{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Effect": "Allow",
            "Principal": {{
                "Service": "bedrock.amazonaws.com"
            }},
            "Action": "sts:AssumeRole",
            "Condition": {{
                "StringEquals": {{
                    "aws:SourceAccount": "{account_id}"
                }},
                "ArnEquals": {{
                    "aws:SourceArn": "arn:aws:bedrock:{region}:{account_id}:model-customization-job/*"
                }}
            }}
        }}
    ]
}}
"""

In [7]:
ACCESS_POLICY_DOC = f"""{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Effect": "Allow",
            "Action": [
                "s3:AbortMultipartUpload",
                "s3:DeleteObject",
                "s3:PutObject",
                "s3:GetObject",
                "s3:GetBucketAcl",
                "s3:GetBucketNotification",
                "s3:ListBucket",
                "s3:PutBucketNotification"
            ],
            "Resource": [
                "arn:aws:s3:::{bucket_name}",
                "arn:aws:s3:::{bucket_name}/*"
            ]
        }}
    ]
}}"""


In [8]:
response = iam.create_role(
    RoleName=role_name,
    AssumeRolePolicyDocument=ROLE_DOC,
    Description="Role for Bedrock to access S3 for finetuning",
)
pprint.pp(response)

{'Role': {'Path': '/',
          'RoleName': 'BedrockRole-1b2c26de-a726-48f5-8909-a7f92501cb8f',
          'RoleId': 'AROAXMGYQR44CAU64UXTI',
          'Arn': 'arn:aws:iam::507260211000:role/BedrockRole-1b2c26de-a726-48f5-8909-a7f92501cb8f',
          'CreateDate': datetime.datetime(2024, 4, 16, 12, 59, 42, tzinfo=tzlocal()),
          'AssumeRolePolicyDocument': {'Version': '2012-10-17',
                                       'Statement': [{'Effect': 'Allow',
                                                      'Principal': {'Service': 'bedrock.amazonaws.com'},
                                                      'Action': 'sts:AssumeRole',
                                                      'Condition': {'StringEquals': {'aws:SourceAccount': '507260211000'},
                                                                    'ArnEquals': {'aws:SourceArn': 'arn:aws:bedrock:us-west-2:507260211000:model-customization-job/*'}}}]}},
 'ResponseMetadata': {'RequestId': '3a5de958-6c6c-40

In [9]:
role_arn = response["Role"]["Arn"]
pprint.pp(role_arn)

'arn:aws:iam::507260211000:role/BedrockRole-1b2c26de-a726-48f5-8909-a7f92501cb8f'


In [10]:
response = iam.create_policy(
    PolicyName=s3_bedrock_finetuning_access_policy,
    PolicyDocument=ACCESS_POLICY_DOC,
)
pprint.pp(response)

{'Policy': {'PolicyName': 'BedrockPolicy-1b2c26de-a726-48f5-8909-a7f92501cb8f',
            'PolicyId': 'ANPAXMGYQR44FQ4OM2HYR',
            'Arn': 'arn:aws:iam::507260211000:policy/BedrockPolicy-1b2c26de-a726-48f5-8909-a7f92501cb8f',
            'Path': '/',
            'DefaultVersionId': 'v1',
            'AttachmentCount': 0,
            'PermissionsBoundaryUsageCount': 0,
            'IsAttachable': True,
            'CreateDate': datetime.datetime(2024, 4, 16, 12, 59, 45, tzinfo=tzlocal()),
            'UpdateDate': datetime.datetime(2024, 4, 16, 12, 59, 45, tzinfo=tzlocal())},
 'ResponseMetadata': {'RequestId': '1a17375d-735b-4d0a-be80-cb1e9e6c6b46',
                      'HTTPStatusCode': 200,
                      'HTTPHeaders': {'date': 'Tue, 16 Apr 2024 12:59:44 GMT',
                                      'x-amzn-requestid': '1a17375d-735b-4d0a-be80-cb1e9e6c6b46',
                                      'content-type': 'text/xml',
                                      'content

In [11]:
policy_arn = response["Policy"]["Arn"]
pprint.pp(policy_arn)

'arn:aws:iam::507260211000:policy/BedrockPolicy-1b2c26de-a726-48f5-8909-a7f92501cb8f'


In [12]:
iam.attach_role_policy(
    RoleName=role_name,
    PolicyArn=policy_arn,
)

{'ResponseMetadata': {'RequestId': '0ec7f37a-9e57-4ae9-b2d4-21a4befdd05b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Tue, 16 Apr 2024 12:59:47 GMT',
   'x-amzn-requestid': '0ec7f37a-9e57-4ae9-b2d4-21a4befdd05b',
   'content-type': 'text/xml',
   'content-length': '212'},
  'RetryAttempts': 0}}

In [13]:
#Load cnn dataset from huggingface
dataset = load_dataset("cnn_dailymail",'3.0.0')

In [14]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [15]:
instruction='''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

instruction:

Summarize the news article provided below.

input:

'''

In [16]:
datapoints_train=[]
for dp in dataset['train']:
    temp_dict={}
    temp_dict['prompt']=instruction+dp['article']
    temp_dict['completion']='response:\n\n'+dp['highlights']
    datapoints_train.append(temp_dict)
    

In [17]:
print(datapoints_train[4]['prompt'])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

instruction:

Summarize the news article provided below.

input:

(CNN)  -- The National Football League has indefinitely suspended Atlanta Falcons quarterback Michael Vick without pay, officials with the league said Friday. NFL star Michael Vick is set to appear in court Monday. A judge will have the final say on a plea deal. Earlier, Vick admitted to participating in a dogfighting ring as part of a plea agreement with federal prosecutors in Virginia. "Your admitted conduct was not only illegal, but also cruel and reprehensible. Your team, the NFL, and NFL fans have all been hurt by your actions," NFL Commissioner Roger Goodell said in a letter to Vick. Goodell said he would review the status of the suspension after the legal proceedings are over. In papers filed Friday with a federal court in Virginia, Vick also admitted that h

In [18]:
datapoints_valid=[]
for dp in dataset['validation']:
    temp_dict={}
    temp_dict['prompt']=instruction+dp['article']
    temp_dict['completion']='response:\n\n'+dp['highlights']
    datapoints_valid.append(temp_dict)

In [19]:
datapoints_test=[]
for dp in dataset['test']:
    temp_dict={}
    temp_dict['prompt']=instruction+dp['article']
    temp_dict['completion']='response:\n\n'+dp['highlights']
    datapoints_test.append(temp_dict)

In [20]:
def dp_transform(data_points,num_dps,max_dp_length):
    lines=[]
    for dp in data_points:
        if len(dp['prompt']+dp['completion'])<=max_dp_length:
                lines.append(dp)
    random.shuffle(lines)
    lines=lines[:num_dps]
    return lines
    

In [21]:
def jsonl_converter(dataset,file_name):
    print(file_name)
    with jsonlines.open(file_name, 'w') as writer:
        for line in dataset:
            writer.write(line)

In [22]:
train=dp_transform(datapoints_train,5000,3000)
validation=dp_transform(datapoints_valid,999,3000)
test=dp_transform(datapoints_test,10,3000)

In [23]:
dataset_folder="fine-tuning-datasets"
train_file_name="train-cnn-5K.jsonl"
validation_file_name="validation-cnn-1K.jsonl"
test_file_name="test-cnn-10.jsonl"
!mkdir fine-tuning-datasets
abs_path=os.path.abspath(dataset_folder)

mkdir: cannot create directory ‘fine-tuning-datasets’: File exists


In [24]:
jsonl_converter(train,f'{abs_path}/{train_file_name}')
jsonl_converter(validation,f'{abs_path}/{validation_file_name}')
jsonl_converter(test,f'{abs_path}/{test_file_name}')

/root/amazon-bedrock-workshop/03_Model_customization/fine-tuning-datasets/train-cnn-5K.jsonl
/root/amazon-bedrock-workshop/03_Model_customization/fine-tuning-datasets/validation-cnn-1K.jsonl
/root/amazon-bedrock-workshop/03_Model_customization/fine-tuning-datasets/test-cnn-10.jsonl


In [25]:
s3_client.upload_file(f'{abs_path}/{train_file_name}', bucket_name, f'fine-tuning-datasets/train/{train_file_name}')
s3_client.upload_file(f'{abs_path}/{validation_file_name}', bucket_name, f'fine-tuning-datasets/validation/{validation_file_name}')
s3_client.upload_file(f'{abs_path}/{test_file_name}', bucket_name, f'fine-tuning-datasets/test/{test_file_name}')

In [26]:
s3_train_uri=f's3://{bucket_name}/fine-tuning-datasets/train/{train_file_name}'
s3_validation_uri=f's3://{bucket_name}/fine-tuning-datasets/validation/{validation_file_name}'
s3_test_uri=f's3://{bucket_name}/fine-tuning-datasets/test/{test_file_name}'

In [27]:
%store role_arn
%store bucket_name
%store role_name
%store policy_arn
%store s3_train_uri
%store s3_validation_uri
%store s3_test_uri

Stored 'role_arn' (str)
Stored 'bucket_name' (str)
Stored 'role_name' (str)
Stored 'policy_arn' (str)
Stored 's3_train_uri' (str)
Stored 's3_validation_uri' (str)
Stored 's3_test_uri' (str)
