# Use this notebook to 
 - download the contents of an Object store bucket (source) to a local folder within Jupyter Hub (download_to)
 - then push those contents to another Object store bucket (target)

This *substantially faster* than downloading and uploading manually.

### You will need to make substitutions take actions in the cell marked **MAKE CHANGES HERE** below 

In [1]:
import os
import boto3

##### The next cell requires you to have created an OpenShift AI Data Connection to Object storage (Minio) 
##### inside your OpenShift AI Project. If you have not, manually override them (*source_region* should be 'none')

In [2]:
source_key_id = os.getenv("AWS_ACCESS_KEY_ID")
source_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
source_region = os.getenv("AWS_DEFAULT_REGION")
source_endpoint = os.getenv("AWS_S3_ENDPOINT")
source_bucket_name = os.getenv("AWS_S3_BUCKET")

print (source_key_id)
print (source_secret_key)
print (source_region)
print (source_endpoint)
print (source_bucket_name)


minio
minio123
none
https://minio-api-minio.apps.rosa-55nsv.lax9.p1.openshiftapps.com
models


# **MAKE CHANGES HERE**  

Details of what you need to do: 
- *download_to* is the name of the local directory that will be created here in Jupyter Hub to download to and upload from
- *source_bucket* and *source_subfolder* are the bucket and subfolder in the Object Storage you will pull from
- *target_bucket* and *target_subfolder* are the bucket and subfolder in the Object Storage you will push to

## Note - before running the notebook, you should delete the **download_to** if it's already there

In [3]:
download_to="<INSERT YOURS>"

source_bucket = "<INSERT YOURS>"
source_subfolder = "<INSERT YOURS>"

target_bucket = "<INSERT YOURS>"
target_subfolder = "<INSERT YOURS>"
target_url = "<INSERT YOURS>"
target_key_id = "<INSERT YOURS>"
target_secret_key = "<INSERT YOURS>"
target_endpoint = "<INSERT YOURS>"


# Example entries (URLs will be invalid for you)
download_to="download_to"

source_bucket = "models"
source_subfolder = "granite-7b-lab/"

target_bucket = "models-target"
target_subfolder = "granite-7b-lab/"
target_url = "https://minio-api-minio.apps.rosa-55nsv.lax9.p1.openshiftapps.com"
target_key_id = source_key_id
target_secret_key = source_secret_key
target_endpoint = target_url


In [4]:
s3 = boto3.client(
   "s3",
   aws_access_key_id=source_key_id,
   aws_secret_access_key=source_secret_key,
   endpoint_url=source_endpoint,
   verify=True)



In [5]:
if not os.path.exists(download_to):
  os.mkdir(download_to) 

## Download from source Object Storage

In [6]:
import boto3
import os

def download_s3_folder(bucket_name, folder_name, local_dir):

    # List all objects in the specified folder
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=folder_name)

    for page in pages:
        if 'Contents' in page:
            for obj in page['Contents']:
                key = obj['Key']
                if not key.endswith('/'):
                    local_file_path = os.path.join(local_dir, key[len(folder_name):])
                    local_file_dir = os.path.dirname(local_file_path)

                    if not os.path.exists(local_file_dir):
                        os.makedirs(local_file_dir)
                    
                    print(f"Downloading {key} to {local_file_path}")
                    s3.download_file(bucket_name, key, local_file_path)



# Example usage
bucket_name = source_bucket
folder_name = source_subfolder  # Ensure it ends with a slash
local_dir = download_to

download_s3_folder(bucket_name, folder_name, local_dir)


Downloading granite-7b-lab/.gitattributes to download_to/.gitattributes
Downloading granite-7b-lab/README.md to download_to/README.md
Downloading granite-7b-lab/added_tokens.json to download_to/added_tokens.json
Downloading granite-7b-lab/config.json to download_to/config.json
Downloading granite-7b-lab/generation_config.json to download_to/generation_config.json
Downloading granite-7b-lab/model-00001-of-00003.safetensors to download_to/model-00001-of-00003.safetensors
Downloading granite-7b-lab/model-00002-of-00003.safetensors to download_to/model-00002-of-00003.safetensors
Downloading granite-7b-lab/model-00003-of-00003.safetensors to download_to/model-00003-of-00003.safetensors
Downloading granite-7b-lab/model.safetensors.index.json to download_to/model.safetensors.index.json
Downloading granite-7b-lab/paper.pdf to download_to/paper.pdf
Downloading granite-7b-lab/special_tokens_map.json to download_to/special_tokens_map.json
Downloading granite-7b-lab/tokenizer.json to download_to/t

## Upload to target Object Storage

In [None]:
target_s3 = boto3.client(
   "s3",
   aws_access_key_id=target_key_id,
   aws_secret_access_key=target_secret_key,
   endpoint_url=target_endpoint,
   verify=True)

Direc = download_to

files = os.listdir(Direc)
files = [f for f in files if os.path.isfile(Direc+'/'+f)]

for filename in files:
  print(filename)
  target_s3.upload_file(download_to+"/"+filename, target_bucket, target_subfolder+filename)

added_tokens.json
special_tokens_map.json
config.json
model.safetensors.index.json
model-00002-of-00003.safetensors


In [None]:
print ("Done")