Install Terraform
https://www.hashicorp.com/official-packaging-guide

In [None]:
%%writefile installterraform.sh
#!/bin/bash
# Installs Terraform from the official HashiCorp yum repository.
# Intended to be run with root privileges, e.g. `sudo ./installterraform.sh`.

# Abort on the first failing command (including a failed download inside a
# pipeline) so yum is never run against a missing or partial repo file.
set -euo pipefail

# Register the HashiCorp repository for Amazon Linux.
# The RHEL repo definition used to be written to this same path first and was
# immediately overwritten by this one, so only this download is kept.
wget -O- https://rpm.releases.hashicorp.com/AmazonLinux/hashicorp.repo | sudo tee /etc/yum.repos.d/hashicorp.repo

sudo yum -y install terraform

Using a terminal, run the command `sudo chmod +x installterraform.sh` and then `sudo ./installterraform.sh`.

Add the following policy to the SageMaker execution role:

```json
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Action": [
                "iam:*",
                "secretsmanager:*",
                "es:*",
                "bedrock:*",
                "sts:*"
            ],
            "Effect": "Allow",
            "Resource": "*"
        },
        {
            "Sid": "BedrockFullAccess",
            "Effect": "Allow",
            "Action": "bedrock:*",
            "Resource": "*"
        }
    ]
}
```

In [None]:
!./download-beta-sdk.sh
!pip install -r requirements.txt

In [None]:
import os, sys, json
from utils import secret, opensearch, dataset, bedrock

In [None]:
# Name of the OpenSearch index used throughout the notebook.
opensearch_index = 'rag'
# NOTE(review): the username is given the same value as the index name ('rag').
# It is later passed to secret.get_secret — confirm this is the intended value.
opensearch_username = opensearch_index

# Presumably the cap on how many records get processed — TODO confirm usage.
early_stop_record_count = 100

# Pin the AWS region for this session and keep a local copy for helper calls.
os.environ['AWS_DEFAULT_REGION'] = 'us-west-2'
region = os.environ['AWS_DEFAULT_REGION']

## Create Embeddings in OpenSearch using Titan Embeddings Model
### Download Dataset

In [None]:
def download_dataset():
    """Download the GooAQ pairs archive and decompress it.

    The compressed archive is deleted once it has been decompressed.

    Returns:
        The value returned by ``dataset.decompress_dataset`` for the archive,
        or ``None`` when ``dataset.download_dataset`` returned no archive.
    """
    dataset_url = "https://huggingface.co/datasets/sentence-transformers/embedding-training-data/resolve/main/gooaq_pairs.jsonl.gz"
    archive = dataset.download_dataset(dataset_url)
    if archive is None:
        # Nothing was downloaded. Return None explicitly; the old code fell
        # through to `return file_path` with `file_path` never assigned,
        # which raised UnboundLocalError instead of signalling failure.
        return None

    file_path = dataset.decompress_dataset(archive)
    dataset.delete_file(archive)
    return file_path

### Initialize OpenSearch

In [None]:
# Look up the secret for the configured username in the configured region
# (presumably from AWS Secrets Manager — confirm in utils.secret).
opensearch_secret = secret.get_secret(opensearch_username, region)
# Build the cluster client from the index name, the secret and the region.
opensearch_client = opensearch.get_opensearch_cluster_client(opensearch_index, opensearch_secret, region)

In [None]:
# Remove the index before (re)creating it; a truthy result from the helper is
# reported as a successful deletion.
response = opensearch.delete_opensearch_index(opensearch_client, opensearch_index)
if response:
    print(f'Deleted existing index: {opensearch_index}')

In [None]:
# Create the index only when it is missing, and add its mapping only when the
# creation helper reports a truthy status.
exists = opensearch.check_opensearch_index(opensearch_client, opensearch_index)  
if not exists:
    status = opensearch.create_index(opensearch_client, opensearch_index)
    if status:
        # NOTE(review): `success` is not checked anywhere in the visible notebook.
        success = opensearch.create_index_mapping(opensearch_client, opensearch_index)

### Connect to Amazon Bedrock

In [None]:
def get_bedrock_client(region, url):
    """Return a Bedrock client built by the project's ``bedrock`` helper.

    Args:
        region: Passed to the helper as ``region``.
        url: Passed to the helper as ``url_override``.

    Returns:
        Whatever ``bedrock.get_bedrock_client`` returns.
    """
    # Each call appends the absolute path of the working directory to sys.path.
    sys.path.append(os.path.abspath("."))
    return bedrock.get_bedrock_client(region=region, url_override=url)

In [None]:
# Placeholder: replace with the real Bedrock endpoint URL before running.
endpoint_url = '<Your endpont url for bedrock>'
bedrock_client = get_bedrock_client(region,endpoint_url)
# Last expression of the cell, so the listing of foundation models is displayed.
bedrock_client.list_foundation_models()

In [None]:
%%time
dataset_path = download_dataset()
if dataset_path is not None:
    records = dataset.prep_for_put(dataset_path)

dataset.delete_file(dataset_path)
print(f'Record count: {len(records)}')

### Create Embeddings using Amazon Bedrock

In [None]:
%%time
embedded_records = []
# Embeddings info https://aws.amazon.com/about-aws/whats-new/2023/09/amazon-titan-embeddings-generally-available/ 
# Base Model ID for titan embeddings https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids-arns.htmlfor record in records[:100]:

    payload = {"inputText":f"{record}"}
    body = json.dumps(payload)
    modelId ="amazon.titan-embed-text-v1"  # amazon.titan-e1t-medium 
    accept = "application/json"
    contentType = "application/json"
    
    response = bedrock_client.invoke_model(
        body=body, modelId=modelId, accept=accept, contentType=contentType
    )
    response_body = json.loads(response.get("body").read())

    embedding = response_body.get("embedding")
    embedded_records.append(
        {"_index": opensearch_index, "text": record, "vector_field": embedding}
    )

In [None]:
# Display how many records were embedded by the previous cell.
len(embedded_records)

### Save embeddings to OpenSearch

In [None]:
# Bulk-write the embedded records through the OpenSearch helper. The helper
# returns a pair: `success` is printed as-is, `failed` is reported by length.
success, failed = opensearch.put_bulk_in_opensearch(embedded_records, opensearch_client)
print(f"Documents saved {success}, documents failed to save {len(failed)}")

### Store variables for reuse in rag2.ipynb

In [None]:
# Persist these variables with IPython's %store so that another notebook can
# restore them with `%store -r`.
# NOTE(review): this list includes `opensearch_secret` and the live
# `opensearch_client` object — confirm that persisting the secret is acceptable
# and that the client can actually be stored and restored.
%store \
region endpoint_url opensearch_client opensearch_username opensearch_secret opensearch_index