In [1]:
import json
import boto3
import time

from typing import *
import heapq as hp
from collections import deque
from collections import defaultdict
import sys

from datetime import datetime

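# Local module with environment-specific variables and IDs; presumably the source of
# bucket_name, folder1_prefix, folder2_prefix, dataSourceId and knowledgeBaseId used below.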
from my_secrets import *

In [2]:
# Module-level AWS clients shared by the cells below. No region or credentials
# are passed explicitly here.
# S3 client used by copy_to_kb() and get_file_size().
s3_client = boto3.client('s3')
# Bedrock runtime client; not used by any code visible in this notebook section.
bedrock_runtime = boto3.client('bedrock-runtime')
# Bedrock control-plane client; not used by any code visible in this notebook section.
bedrock = boto3.client('bedrock')
# Bedrock Agent client used to start/poll ingestion jobs and to read the data source.
bedrockagent = boto3.client('bedrock-agent')

In [3]:
def get_s3_files(bucket_name, prefix):
    """Return the base names of all objects stored under an S3 prefix.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Key prefix ("folder") whose objects are listed.

    Returns:
        list: The last path segment of every object key, in listing order.
        Keys that end in '/' (empty last segment) are left out.
    """
    s3 = boto3.client('s3')
    page_iter = s3.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )

    # Walk every result page; pages without a 'Contents' entry hold no objects.
    basenames = (
        entry['Key'].rsplit('/', 1)[-1]
        for page in page_iter
        if 'Contents' in page
        for entry in page['Contents']
    )

    # An empty base name means the key itself ended with '/', so drop it.
    return [name for name in basenames if name]

def get_missing_files():
    """Find file names present under folder1_prefix but absent under folder2_prefix.

    Both prefixes are listed in the bucket given by the module-level
    bucket_name, and files are compared by base name only.

    Returns:
        set: Names found under folder1_prefix that are not under folder2_prefix.
        None: If anything goes wrong while listing (the error is printed).
    """
    try:
        source_names = set(get_s3_files(bucket_name, folder1_prefix))
        kb_names = set(get_s3_files(bucket_name, folder2_prefix))

        # Whatever exists in the source folder but not in the second one.
        only_in_source = source_names.difference(kb_names)

        print(f"\nTotal missing files: {len(only_in_source)}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
    return only_in_source



In [4]:

def copy_to_kb(filename):
    """Copy one file from the folder1_prefix folder to the folder2_prefix folder.

    Source and destination live in the same bucket (module-level bucket_name);
    the copy goes through the module-level s3_client.

    Args:
        filename: Base name of the file; it is appended to each prefix as-is.

    Returns:
        bool: True when the copy succeeded, False when it raised. Errors are
        still printed rather than propagated, but the return value now lets
        callers detect the failure instead of carrying on as if the file
        had been copied.
    """
    source_key = f"{folder1_prefix}{filename}"
    destination_key = f"{folder2_prefix}{filename}"
    try:
        s3_client.copy(
            CopySource={'Bucket': bucket_name, 'Key': source_key},
            Bucket=bucket_name,
            Key=destination_key,
        )
    except Exception as e:
        print(f"Error: {str(e)}")
        return False
    return True


In [5]:
def get_file_size(bucket_name, filename, prefix="test/store/folder_big_images/"):
    """Return the size of an S3 object as a human-readable string.

    Args:
        bucket_name: Bucket that holds the object.
        filename: Base name of the object; appended to ``prefix`` to form the key.
        prefix: Key prefix of the folder containing the file. Defaults to the
            previously hard-coded folder so existing two-argument calls behave
            exactly as before.

    Returns:
        str: The size with two decimals and a unit, e.g. "362.19 KB".

    Raises:
        Whatever s3_client.head_object raises (e.g. when the key does not exist).
    """
    file_key = f"{prefix}{filename}"
    response = s3_client.head_object(Bucket=bucket_name, Key=file_key)
    file_size = response['ContentLength']

    # Divide by 1024 until the value fits the current unit.
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if file_size < 1024:
            return f"{file_size:.2f} {unit}"
        file_size /= 1024
    return f"{file_size:.2f} PB"


In [6]:

def upload_file():
    """Copy one missing file into the knowledge-base folder and ingest it.

    Steps: find the files missing from the second folder, pick one, print its
    size, copy it with copy_to_kb(), start a Bedrock ingestion job for the
    data source and poll that job until it reaches a terminal status.

    Returns:
        int: The number of files that were missing when this call started
        (including the one just uploaded) if the ingestion job completed.
        0 when there is nothing to upload or when any step fails, so a
        ``while missing > 0`` driver loop stops cleanly on every path.
    """
    missing_files = get_missing_files()

    if not missing_files:
        print("No missing files to upload.")
        return 0

    # Only one file is handled per call; take whichever the set yields first.
    file_to_upload = next(iter(missing_files))
    print(f"Uploading the file: {file_to_upload}")

    try:
        file_size = get_file_size(bucket_name, file_to_upload)
        # get_file_size already returns a formatted value with its unit.
        print(f"File size: {file_size}")
    except Exception as e:
        print(f"Error getting file size: {e}")
        return 0

    try:
        copied = copy_to_kb(file_to_upload)
    except Exception as e:
        print(f"Error copying file to KB: {e}")
        return 0
    # copy_to_kb handles its own exceptions, so an exception may never reach
    # this function; also honour an explicit False result as "copy failed"
    # rather than ingesting without the file.
    if copied is False:
        print(f"Error copying file to KB: {file_to_upload}")
        return 0

    # clientToken must be unique per request and at least 33 characters long.
    # One microsecond timestamp is 22 characters, so it is doubled.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S-%f")
    client_token = timestamp + timestamp

    try:
        ingestion_response = bedrockagent.start_ingestion_job(
            clientToken=client_token,
            dataSourceId=dataSourceId,
            description='incremental ingestion 0',
            knowledgeBaseId=knowledgeBaseId
        )
        job_id = ingestion_response['ingestionJob']['ingestionJobId']

        # Statuses after which the job will not change any more. STOPPED is
        # included so a stopped job cannot keep this loop polling forever.
        terminal_statuses = ('COMPLETE', 'FAILED', 'STOPPED')
        while True:
            job = bedrockagent.get_ingestion_job(
                dataSourceId=dataSourceId,
                ingestionJobId=job_id,
                knowledgeBaseId=knowledgeBaseId
            )['ingestionJob']
            status = job['status']
            print(f"Ingestion job status: {status}")
            if status in terminal_statuses:
                break
            time.sleep(10)  # Wait between polls to avoid excessive API calls

        if status != 'COMPLETE':
            if status == 'FAILED':
                print("ingestion job failed")
            else:
                print(f"ingestion job ended with status {status}")
            # The last poll already carries the failure details, if any.
            for reason in job.get('failureReasons', []):
                print(reason)
            return 0

        print("Ingestion job completed successfully.")
        return len(missing_files)
    except Exception as e:
        print(f"Error during ingestion process: {e}")
        return 0

In [7]:
# Upload missing files one per iteration until none remain or an upload fails.
# If upload_file() returns None (no value), `or 0` turns that into a clean stop
# instead of a TypeError from comparing None with 0.
missing = 1
while missing > 0:
    missing = upload_file() or 0


Total missing files: 40
Uploading the file: bmw-m4-parked-on-a-wet-road-at-night_2.jpg
File size: 362.19 KB bytes
Ingestion job status: STARTING
Ingestion job status: COMPLETE
Ingestion job completed successfully.

Total missing files: 39
Uploading the file: unnamed.jpg
File size: 65.83 KB bytes
Ingestion job status: STARTING
Ingestion job status: COMPLETE
Ingestion job completed successfully.

Total missing files: 38
Uploading the file: manhattan-bridge-new-york-city-united-states-purple-sky-11658x6112-4370.jpg
File size: 3.71 MB bytes
Ingestion job status: STARTING
Ingestion job status: COMPLETE
Ingestion job completed successfully.

Total missing files: 37
Uploading the file: ghost-in-the-shell-4k-wallpapers-v0-tjl9tdybe0xa1.jpg
File size: 2.08 MB bytes
Ingestion job status: STARTING
Ingestion job status: COMPLETE
Ingestion job completed successfully.

Total missing files: 36
Uploading the file: andrew-preble-199410-unsplash.jpg
File size: 7.15 MB bytes
Ingestion job status: STARTI

In [11]:
# Fetch the data source definition from the knowledge base and print the
# parsing strategy configured for its vector ingestion.
# NOTE(review): "FMP" presumably stands for foundation-model parsing — confirm.
datasource_FMP = bedrockagent.get_data_source(
    dataSourceId=dataSourceId,
    knowledgeBaseId=knowledgeBaseId
)
# The chained lookup raises KeyError if any of the nested keys is missing from the response.
print(datasource_FMP['dataSource']['vectorIngestionConfiguration']['parsingConfiguration']['parsingStrategy'])

BEDROCK_FOUNDATION_MODEL
