# Training a custom model with Amazon Bedrock

In [28]:
import json
import os
from pathlib import Path

import tiktoken


def count_tokens(text, encoding_name="cl100k_base"):
    """Return how many tokens ``text`` occupies under a tiktoken encoding.

    Args:
        text: The string to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The length of the token sequence produced by encoding ``text``.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(text)
    return len(token_ids)


def split_text(text, max_tokens=600, encoding_name="cl100k_base"):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` tokens.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk. Must be positive.
        encoding_name: Name of the tiktoken encoding used to tokenize ``text``
            and to decode each chunk back into a string.

    Returns:
        A list of decoded chunk strings in document order. The list is empty
        when ``text`` encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive. A non-positive value
            would otherwise never advance through the token list.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens!r}")

    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)

    # NOTE(review): chunks are cut on token boundaries, not character
    # boundaries, so a multi-byte character whose tokens straddle a cut may
    # not decode cleanly -- confirm against tiktoken's decode behaviour if
    # the corpus contains a lot of non-ASCII text.
    return [
        encoding.decode(tokens[start : start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]


def process_files(data_dir, output_file):
    """Chunk every Markdown file under ``data_dir`` into a JSON Lines file.

    Each ``*.md`` file found recursively below ``data_dir`` is read, split
    with ``split_text`` (default settings), and every chunk is written to
    ``output_file`` as one JSON object per line of the form
    ``{"input": <chunk>}``.

    Args:
        data_dir: Root directory that is searched recursively for ``*.md``.
        output_file: Path of the JSONL file to create; an existing file is
            overwritten.
    """
    # Sort the paths: rglob order depends on the filesystem, and a stable
    # order makes the generated dataset reproducible between runs.
    markdown_paths = sorted(Path(data_dir).rglob("*.md"))

    # Explicit UTF-8 so reading does not depend on the platform's default
    # locale encoding.
    with open(output_file, "w", encoding="utf-8") as out:
        for path in markdown_paths:
            text = path.read_text(encoding="utf-8")
            for chunk in split_text(text):
                out.write(json.dumps({"input": chunk}) + "\n")


In [29]:


# Directory holding the Markdown documents. Set the BEDROCK_RAG_DATA_DIR
# environment variable to point at your own copy; the fallback is the original
# author's local checkout and will not exist on other machines.
data_dir = os.environ.get(
    "BEDROCK_RAG_DATA_DIR",
    "/Users/strickvl/coding/zenml/repos/zenml-projects/bedrock-rag-llm/data",
)
output_file = "pretraining_inputs.jsonl"

# Fail fast: searching a missing directory finds no files, which would
# silently produce (and later upload) an empty dataset.
if not Path(data_dir).is_dir():
    raise FileNotFoundError(f"Data directory not found: {data_dir}")

process_files(data_dir, output_file)

## Upload to S3

In [30]:
from zenml.client import Client
from zenml.service_connectors.service_connector import ServiceConnector


def get_boto_client(
    connector_id: str = "0b04bcae-efc9-4044-a1c2-b86281cb0820",
) -> "boto3.Session":
    """Return an authenticated AWS session from a ZenML service connector.

    Args:
        connector_id: Name, ID or ID prefix of the ZenML service connector to
            use. Defaults to the connector this notebook was written against.
            TODO: read the default from a config file instead.

    Returns:
        The object produced by ``connect()`` on the connector client for the
        ``aws-generic`` resource type. The rest of this notebook uses it as a
        boto3 session (``.client("s3")``, ``.client("iam")``, ...).
    """
    connector = Client().get_service_connector_client(
        name_id_or_prefix=connector_id,
        resource_type="aws-generic",
    )
    return connector.connect()


def upload_to_s3(file_path, bucket_name, object_key):
    """Upload a local file to an S3 bucket.

    The S3 client is created from the session returned by
    ``get_boto_client()``.

    Args:
        file_path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        object_key: Key under which the object is stored in the bucket.
    """
    s3 = get_boto_client().client("s3")
    s3.upload_file(file_path, bucket_name, object_key)



In [31]:

# Destination for the generated JSONL file: s3://<bucket_name>/<object_key>
bucket_name = "bedrock-zenml-rag-docs"
object_key = "pretraining_inputs.jsonl"

# Push the local JSONL file written earlier (`output_file`) to that location
upload_to_s3(
    file_path=output_file,
    bucket_name=bucket_name,
    object_key=object_key,
)

## Setup for Custom Model Training

In [56]:
import warnings
warnings.filterwarnings('ignore')
import json
import os
import sys
import boto3
import time
import pprint
import random

# Session obtained through the ZenML service connector; the AWS clients below
# are all created from it.
boto_client = get_boto_client()
region = "us-east-1"

# The account ID is interpolated into the IAM role ARN and policy documents.
sts_client = boto_client.client('sts')
account_id = sts_client.get_caller_identity()["Account"]
s3_suffix = f"{region}-{account_id}"

# Kept under its own name instead of being assigned to `bucket_name`: the
# training data is uploaded to, and the customization job reads from, the
# bucket already stored in `bucket_name`. Overwriting it here made the IAM
# access policy (which is rendered from `bucket_name`) grant access to a
# different bucket than the one the job actually uses.
customization_bucket_name = f"bedrock-customization-{s3_suffix}"

s3_client = boto_client.client('s3')
bedrock = boto_client.client(service_name="bedrock", region_name=region)
bedrock_runtime = boto_client.client(service_name="bedrock-runtime", region_name=region)
iam = boto_client.client('iam', region_name=region)

In [41]:
# Name of the IAM role that is created for the Bedrock customization job.
role_name = "AmazonBedrockCustomizationRole1"
# Name of the IAM policy that is created and attached to that role.
s3_bedrock_finetuning_access_policy="AmazonBedrockCustomizationPolicy1"
# Role ARN derived from the account ID and role name.
# NOTE(review): not referenced by the other visible cells, which use the ARN
# returned by `create_role` instead.
customization_role = f"arn:aws:iam::{account_id}:role/{role_name}"

In [57]:
# Ask Bedrock which foundation models support fine-tuning, then print the
# model IDs one per line.
fine_tunable_models = bedrock.list_foundation_models(byCustomizationType="FINE_TUNING")
models_fine_tuning = [
    summary["modelId"] for summary in fine_tunable_models["modelSummaries"]
]
for model in models_fine_tuning:
    print(model)

amazon.titan-image-generator-v1:0
amazon.titan-image-generator-v2:0
amazon.titan-text-lite-v1:0:4k
amazon.titan-text-express-v1:0:8k
amazon.titan-embed-image-v1:0
cohere.command-text-v14:7:4k
cohere.command-light-text-v14:7:4k
meta.llama2-13b-v1:0:4k
meta.llama2-70b-v1:0:4k


In [47]:
# Both documents are built as Python dicts and serialized with json.dumps so
# the JSON is always well formed; hand-written f-string JSON needs every brace
# doubled and breaks silently when one is missed.

# Trust policy: allows the Bedrock service to assume the role, restricted to
# model-customization jobs of this account in this region.
ROLE_DOC = json.dumps(
    {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {"Service": "bedrock.amazonaws.com"},
                "Action": "sts:AssumeRole",
                "Condition": {
                    "StringEquals": {"aws:SourceAccount": account_id},
                    "ArnEquals": {
                        "aws:SourceArn": f"arn:aws:bedrock:{region}:{account_id}:model-customization-job/*"
                    },
                },
            }
        ],
    },
    indent=4,
)

# Access policy: S3 permissions on `bucket_name` and the objects inside it.
# `bucket_name` must be the bucket that holds the training data and receives
# the job output, otherwise the customization job cannot read or write it.
ACCESS_POLICY_DOC = json.dumps(
    {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": [
                    "s3:AbortMultipartUpload",
                    "s3:DeleteObject",
                    "s3:PutObject",
                    "s3:GetObject",
                    "s3:GetBucketAcl",
                    "s3:GetBucketNotification",
                    "s3:ListBucket",
                    "s3:PutBucketNotification",
                ],
                "Resource": [
                    f"arn:aws:s3:::{bucket_name}",
                    f"arn:aws:s3:::{bucket_name}/*",
                ],
            }
        ],
    },
    indent=4,
)

In [48]:
# Create the role Bedrock assumes while running the customization job.
# Re-running this cell must not fail: if the role already exists, fetch it
# instead. `get_role` returns the same {"Role": {...}} shape as `create_role`,
# so later lookups of response["Role"]["Arn"] keep working.
try:
    response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=ROLE_DOC,
        Description="Role for Bedrock to access S3 for finetuning",
    )
except iam.exceptions.EntityAlreadyExistsException:
    # NOTE: the existing role keeps whatever trust policy it was created
    # with; it is not updated from ROLE_DOC here.
    response = iam.get_role(RoleName=role_name)
pprint.pp(response)

{'Role': {'Path': '/',
          'RoleName': 'AmazonBedrockCustomizationRole1',
          'RoleId': 'AROAU6GDV3UC73JYBCTEW',
          'Arn': 'arn:aws:iam::339712793861:role/AmazonBedrockCustomizationRole1',
          'CreateDate': datetime.datetime(2024, 9, 6, 15, 41, 11, tzinfo=tzutc()),
          'AssumeRolePolicyDocument': {'Version': '2012-10-17',
                                       'Statement': [{'Effect': 'Allow',
                                                      'Principal': {'Service': 'bedrock.amazonaws.com'},
                                                      'Action': 'sts:AssumeRole',
                                                      'Condition': {'StringEquals': {'aws:SourceAccount': '339712793861'},
                                                                    'ArnEquals': {'aws:SourceArn': 'arn:aws:bedrock:us-east-1:339712793861:model-customization-job/*'}}}]}},
 'ResponseMetadata': {'RequestId': 'c1c31060-3bd8-4d3e-9eaa-7a041ca3d988',
              

In [49]:
# Keep the role's ARN; it is passed as `roleArn` when the customization job
# is created. `response` here must still be the IAM role response.
role_arn = response["Role"]["Arn"]
pprint.pp(role_arn)

'arn:aws:iam::339712793861:role/AmazonBedrockCustomizationRole1'


In [50]:
# Create the managed policy that grants the S3 access described in
# ACCESS_POLICY_DOC. Re-running this cell must not fail: if the policy
# already exists, fetch it instead. `get_policy` returns the same
# {"Policy": {...}} shape, so response["Policy"]["Arn"] keeps working.
try:
    response = iam.create_policy(
        PolicyName=s3_bedrock_finetuning_access_policy,
        PolicyDocument=ACCESS_POLICY_DOC,
    )
except iam.exceptions.EntityAlreadyExistsException:
    # The policy is created with the default path "/", so its ARN can be
    # derived from the account ID and the policy name.
    # NOTE: the existing policy document is not updated from
    # ACCESS_POLICY_DOC here.
    existing_policy_arn = (
        f"arn:aws:iam::{account_id}:policy/{s3_bedrock_finetuning_access_policy}"
    )
    response = iam.get_policy(PolicyArn=existing_policy_arn)
pprint.pp(response)

{'Policy': {'PolicyName': 'AmazonBedrockCustomizationPolicy1',
            'PolicyId': 'ANPAU6GDV3UCWBF6JOAOA',
            'Arn': 'arn:aws:iam::339712793861:policy/AmazonBedrockCustomizationPolicy1',
            'Path': '/',
            'DefaultVersionId': 'v1',
            'AttachmentCount': 0,
            'PermissionsBoundaryUsageCount': 0,
            'IsAttachable': True,
            'CreateDate': datetime.datetime(2024, 9, 6, 15, 41, 31, tzinfo=tzutc()),
            'UpdateDate': datetime.datetime(2024, 9, 6, 15, 41, 31, tzinfo=tzutc())},
 'ResponseMetadata': {'RequestId': '27804fe8-8b6f-4d63-9980-ec100b7b2c67',
                      'HTTPStatusCode': 200,
                      'HTTPHeaders': {'date': 'Fri, 06 Sep 2024 15:41:31 GMT',
                                      'x-amzn-requestid': '27804fe8-8b6f-4d63-9980-ec100b7b2c67',
                                      'content-type': 'text/xml',
                                      'content-length': '801'},
                      

In [51]:
# Keep the policy's ARN; it is needed to attach the policy to the role.
# `response` here must still be the IAM policy response.
policy_arn = response["Policy"]["Arn"]
pprint.pp(policy_arn)

'arn:aws:iam::339712793861:policy/AmazonBedrockCustomizationPolicy1'


In [52]:
# Attach the S3 access policy to the customization role. The call is the last
# expression in the cell, so its response is shown as the cell output.
iam.attach_role_policy(
    RoleName=role_name,
    PolicyArn=policy_arn,
)

{'ResponseMetadata': {'RequestId': 'a41a5ab2-cae5-4ec4-8cc7-27b0a4552a4f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Fri, 06 Sep 2024 15:41:48 GMT',
   'x-amzn-requestid': 'a41a5ab2-cae5-4ec4-8cc7-27b0a4552a4f',
   'content-type': 'text/xml',
   'content-length': '212'},
  'RetryAttempts': 0}}

## Create Custom Model

In [45]:
# Obtain a session from the ZenML connector and create the Bedrock client
# used to submit the customization job.
# NOTE(review): same service and region as the `bedrock` client created in
# the setup cell -- the two look interchangeable; confirm before merging them.
boto_client = get_boto_client()
bedrock_client = boto_client.client("bedrock", region_name=region)


In [58]:
from datetime import datetime
# Timestamp suffix so that every run submits a job with a distinct name.
ts = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

pretraining_data_filename = "pretraining_inputs.jsonl"
bucket_name = "bedrock-zenml-rag-docs"
base_model_id = "amazon.titan-text-express-v1:0:8k"
# The dataset consists of unlabeled {"input": "<text>"} records, which is the
# continued pre-training format. FINE_TUNING expects prompt/completion pairs,
# so a FINE_TUNING job would not accept this file.
customization_type = "CONTINUED_PRE_TRAINING"

response = bedrock_client.create_model_customization_job(
    jobName=f"my-custom-model-finetune-job-{ts}",
    customModelName="my-custom-model-titan-text-express-v1",
    customizationType=customization_type,
    # Role that Bedrock assumes to read the training data and write output.
    roleArn=role_arn,
    baseModelIdentifier=base_model_id,
    jobTags=[{"key": "z-owner", "value": "alex-strick"}],
    customModelTags=[{"key": "z-owner", "value": "alex-strick"}],
    trainingDataConfig={
        "s3Uri": f"s3://{bucket_name}/{pretraining_data_filename}"
    },
    outputDataConfig={
        "s3Uri": f"s3://{bucket_name}"
    },
    # Hyperparameter values are passed as strings.
    hyperParameters={
        "learningRate": "0.00001",
        "epochCount": "5",
        "batchSize": "1",
        "learningRateWarmupSteps": "5",
    }
)

# The job runs asynchronously; the ARN identifies it for later status checks.
job_arn = response["jobArn"]
print(f"Model customization job ARN: {job_arn}")

Model customization job ARN: arn:aws:bedrock:us-east-1:339712793861:model-customization-job/amazon.titan-text-express-v1:0:8k/2fb41q1dfsat
