# Training a custom model with Amazon Bedrock

In [1]:
import json
from pathlib import Path

import tiktoken


def count_tokens(text, encoding_name="cl100k_base"):
    """Return how many tokens ``text`` encodes to.

    Args:
        text: The string to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The length of the token sequence produced by the encoding.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(text)
    return len(token_ids)


def split_text(text, max_tokens=600, encoding_name="cl100k_base"):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` tokens.

    The text is tokenized once, the token sequence is cut into
    non-overlapping windows of ``max_tokens`` tokens, and each window is
    decoded back to a string.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        A list of decoded chunks in original order; empty when ``text``
        encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is zero or negative.
    """
    # A non-positive window never advances through the token list, so the
    # original loop would never terminate; reject it before doing any work.
    if max_tokens <= 0:
        raise ValueError(
            f"max_tokens must be a positive integer, got {max_tokens!r}"
        )

    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)
    # NOTE(review): windows are cut on token boundaries, not character
    # boundaries -- confirm decoded chunks are acceptable for text whose
    # characters span several tokens.
    return [
        encoding.decode(tokens[start : start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]


def process_files(data_dir, output_file):
    """Chunk every Markdown file under ``data_dir`` into a JSONL file.

    Each ``*.md`` file found recursively below ``data_dir`` is read, split
    with ``split_text`` using its default settings, and every chunk is
    written to ``output_file`` as one JSON object per line of the form
    ``{"input": <chunk>}``.

    Args:
        data_dir: Directory that is searched recursively for ``*.md`` files.
        output_file: Path of the JSONL file to create; an existing file is
            overwritten.
    """
    # Explicit UTF-8 on both sides: the platform default encoding is not
    # UTF-8 everywhere, and Markdown sources routinely contain non-ASCII text.
    with open(output_file, "w", encoding="utf-8") as out:
        # rglob yields paths in filesystem order; sorting makes the record
        # order in the output reproducible across runs and machines.
        for path in sorted(Path(data_dir).rglob("*.md")):
            text = path.read_text(encoding="utf-8")
            out.writelines(
                json.dumps({"input": chunk}) + "\n"
                for chunk in split_text(text)
            )


# Root of the Markdown corpus to chunk.
# NOTE: this is an absolute path on one developer's machine.
data_dir = (
    "/Users/strickvl/coding/zenml/repos/zenml-projects/bedrock-rag-llm/data"
)
# JSONL file produced by this cell; the upload step reads it again later.
output_file = "pretraining_inputs.jsonl"

# Build the JSONL file from every Markdown file below data_dir.
process_files(data_dir=data_dir, output_file=output_file)

## Upload to S3

In [2]:
from typing import Any

from zenml.client import Client
from zenml.service_connectors.service_connector import ServiceConnector


def get_boto_client(
    connector_id: str = "0b04bcae-efc9-4044-a1c2-b86281cb0820",  # TODO: pull this out into config file
) -> Any:
    """Return an authenticated AWS client object from a ZenML service connector.

    Looks up the service connector identified by ``connector_id`` for the
    ``aws-generic`` resource type and returns whatever its ``connect()``
    call produces.

    Args:
        connector_id: Name, ID or ID prefix of the ZenML service connector
            to use. Defaults to the connector this notebook was written
            against.

    Returns:
        The object returned by the connector's ``connect()``. This is not
        the ``ServiceConnector`` itself; the caller in this file uses it via
        ``.client("s3")``.
        NOTE(review): presumably a ``boto3.Session`` for ``aws-generic`` --
        confirm against the ZenML AWS connector documentation.
    """
    connector: ServiceConnector = Client().get_service_connector_client(
        name_id_or_prefix=connector_id,
        resource_type="aws-generic",
    )
    return connector.connect()


# Upload the JSONL file to S3
def upload_to_s3(file_path, bucket_name, object_key):
    """Upload a local file to S3.

    Args:
        file_path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        object_key: Key the object is stored under in the bucket.
    """
    s3 = get_boto_client().client("s3")
    s3.upload_file(file_path, bucket_name, object_key)


# Destination bucket and key for the generated training data
bucket_name = "bedrock-zenml-rag-docs"
object_key = "pretraining_inputs.jsonl"

# Push the JSONL file written by the previous cell to S3
upload_to_s3(
    file_path=output_file,
    bucket_name=bucket_name,
    object_key=object_key,
)

Could not import GCP service connector: No module named 'google.api_core'.
Could not import Azure service connector: No module named 'azure'.
Could not import HyperAI service connector: No module named 'paramiko'.
