## Exercise Solution: Upload (again, again) to S3

In [None]:
import json
import logging
import os
import zipfile

import boto3
from botocore.exceptions import ClientError

# Name of the s3 bucket the data is uploaded to
BUCKET = "udacity-sagemaker-solutiondata2021"
# Key prefix placed in front of the file name inside the bucket
s3_prefix = "l2e3"
# Local file the formatted data is written to; its name is reused in the s3 key
file_name = "music_instruments_reviews.txt"


def unzip_data(input_data_path):
    """Extract every member of a zip archive into the current directory.

    Args:
        input_data_path: Path of the zip archive to read.
    """
    archive = zipfile.ZipFile(input_data_path, mode='r')
    try:
        archive.extractall(path='.')
    finally:
        # Release the archive handle whether or not extraction succeeded.
        archive.close()

def split_sentences(input_data):
    """Collect the sentences of every review that passes the vote filter.

    Each line of the input file is parsed as a JSON object that carries a
    ``helpful`` pair ``[helpful_votes, total_votes]`` and a ``reviewText``
    string. A review is skipped when it has no votes at all or when exactly
    half of its votes are helpful. The text of every remaining review is
    split on "." and the non-empty pieces are kept as-is (surrounding
    whitespace is not stripped).

    Args:
        input_data: Path of the JSON-lines review file.

    Returns:
        A list of sentence strings, in file order.
    """
    kept_sentences = []
    # The context manager closes the file even if a line fails to parse.
    with open(input_data, 'r') as reviews:
        for line in reviews:
            review = json.loads(line)
            helpful_votes = float(review['helpful'][0])
            total_votes = review['helpful'][1]
            # Filter out same data as prior jobs. The zero check comes first
            # so the division below can never divide by zero.
            if total_votes == 0 or helpful_votes / total_votes == .5:
                continue
            # Drop empty pieces, which are common with "..." in the text.
            kept_sentences.extend(
                piece for piece in review['reviewText'].split(".") if piece
            )
    return kept_sentences

# Format the data as  {'source': 'THIS IS A SAMPLE SENTENCE'}
# And write the data into a file
def cycle_data(fp, data):
    """Write each item of data to fp as one JSON object per line.

    Args:
        fp: Writable text file object.
        data: Iterable of values; each is stored under the 'source' key.
    """
    for sentence in data:
        record = json.dumps({'source': sentence})
        fp.write(record + '\n')

# upload the data to s3
def upload_file_to_s3(file_name, s3_prefix):
    """Upload a local file to BUCKET under the key '<s3_prefix>/<file_name>'.

    Args:
        file_name: Path of the local file; also used as the tail of the key.
        s3_prefix: Key prefix placed in front of file_name.

    Returns:
        True when the upload call completes, False when it raises
        ClientError (the error is logged). Any other exception propagates
        to the caller.
    """
    # S3 keys are "/"-separated whatever the local OS path separator is.
    object_name = "/".join([s3_prefix, file_name])
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, BUCKET, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

# Extract the raw review archive into the working directory
unzip_data('reviews_Musical_Instruments_5.json.zip')

# Split the extracted reviews_Musical_Instruments_5.json into sentences
sentences = split_sentences('reviews_Musical_Instruments_5.json')

# Serialize the sentences to a local file, then push that file to s3
with open(file_name, 'w') as f:
    cycle_data(f, sentences)
upload_file_to_s3(file_name, s3_prefix)

# s3 URI of the uploaded data
batch_transform_input_path = f"s3://{BUCKET}/{s3_prefix}/{file_name}"
print(batch_transform_input_path)


## Exercise Solution: Use Batch Transform 

In [None]:
from sagemaker import get_execution_role
from sagemaker.model import Model
from sagemaker import image_uris

# Execution role and the BlazingText container image for us-west-2
role = get_execution_role()
image_uri = image_uris.retrieve(region='us-west-2', framework='blazingtext')

# s3 locations: model artifact to load, and where transform output is written
model_data = "s3://udacity-sagemaker-solutiondata2021/l2e1/model_artifact/hello-blaze2021-2/output/model.tar.gz"
batch_transform_output_path = "s3://udacity-sagemaker-solutiondata2021/l2e3/batchtransform_output"

# Wrap the artifact in a Model and derive a single-instance transformer from it
model = Model(role=role, model_data=model_data, image_uri=image_uri)
transformer = model.transformer(
    output_path=batch_transform_output_path,
    instance_type='ml.m4.xlarge',
    instance_count=1,
)

# Run the transform over the uploaded JSON-lines data, splitting it by line
transformer.transform(
    batch_transform_input_path,
    split_type='Line',
    content_type='application/jsonlines',
    data_type='S3Prefix',
)

# Wait on the transform job before moving on
transformer.wait()