## 1. Set up

In [1]:
import os
from time import gmtime, strftime

import boto3
import pandas as pd
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

# SageMaker session, default S3 bucket and execution role used by the cells below.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
# boto3.Session() is the session object itself; the region name is its
# `region_name` attribute, which is what a variable called `region` should hold.
region = boto3.Session().region_name
print(bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker-us-east-1-477886989750


In [2]:
# Number of CSV files that generate_data() writes.
num_files=10
# Presumably the number of instances for the processing job -- confirm against the cell that creates the processor.
num_instances=2

## 2. Generate Data

In [3]:
!mkdir data

In [4]:
# Generate small CSV files with ascending names (file0.csv, file1.csv, ...) in the local
# `data` directory. Each file has an `income` header followed by one random integer per line.
import random
def generate_data(n_files=None, rows_per_file=2, out_dir='data'):
    """Write sample CSV files containing random incomes.

    Args:
        n_files: Number of files to write. Defaults to the notebook-level
            `num_files` constant when omitted.
        rows_per_file: Number of data rows written after the `income` header.
        out_dir: Directory the files are written to; created if it is missing.

    Returns:
        True once all files have been written.
    """
    if n_files is None:
        n_files = num_files
    # Create the target directory here so the function does not depend on a
    # separate `mkdir` cell having been run first.
    os.makedirs(out_dir, exist_ok=True)
    for i in range(n_files):
        with open(os.path.join(out_dir, f'file{i}.csv'), 'w') as f:
            f.write('income\n')
            for _ in range(rows_per_file):
                f.write(f'{random.randint(1,100)}\n')
    return True

In [5]:
# Write the sample CSV files into the local data directory.
generate_data()

True

In [9]:
# Build S3 key prefixes under a UTC timestamp so each run gets its own location.
# NOTE: re-running this cell produces a new timestamp; re-run the upload cell
# afterwards so the files exist under the new input_prefix.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
prefix = "sagemaker/spark-preprocess-demo/{}".format(timestamp_prefix)
# Key prefix the raw CSV files are uploaded under.
input_prefix = "{}/input/raw/data".format(prefix)
# Key prefix handed to the processing job as its output location.
input_preprocessed_prefix = "{}/input/preprocessed/data".format(prefix)
print(input_prefix)

sagemaker/spark-preprocess-demo/2024-01-20-13-17-41/input/raw/data


In [7]:
# Upload the generated CSV files from the local `data` folder to the S3 bucket `bucket`
# under the key prefix `input_prefix`.
def upload_data():
    """Upload data/file0.csv .. data/file{num_files - 1}.csv to S3.

    Reads the notebook-level `bucket`, `input_prefix` and `num_files`.

    Returns:
        The `s3://bucket/input_prefix/` URI of the folder the files were uploaded to.
    """
    s3_client = boto3.client('s3')
    # Iterate over num_files rather than a literal 10 so the upload stays in
    # sync with the number of files generate_data() writes.
    for i in range(num_files):
        s3_client.upload_file(f'data/file{i}.csv', bucket, f'{input_prefix}/file{i}.csv')
    return "s3://{}/{}/".format(bucket, input_prefix)

In [8]:
# Upload the generated files and keep the S3 URI of the folder they were written to.
data_s3_uri=upload_data()
print (data_s3_uri)

s3://sagemaker-us-east-1-477886989750/sagemaker/spark-preprocess-demo/2024-01-20-13-04-36/input/raw/data/


## 3. Write the Processing Script

In [11]:
!mkdir code

In [12]:
%%writefile ./code/preprocess.py
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import csv
import os
import shutil
import sys
import time

import string
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as _sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import sum as _sum


def _s3_uri(bucket, *key_parts):
    """Join a bucket name and key parts into an s3:// URI, ignoring stray slashes."""
    cleaned = [part.strip("/") for part in key_parts if part and part.strip("/")]
    return "s3://" + "/".join([bucket.strip("/")] + cleaned)


def main():
    """Sum the `income` column of every input CSV file and write one result per file.

    Command-line arguments (all required):
        --s3_input_bucket / --s3_input_key_prefix: location of the input CSV files.
        --s3_output_bucket / --s3_output_key_prefix: location the result is written
            under, in a `processed` sub-folder.

    The output is a single text part containing one row per input file with the
    file name and the sum of its `income` values.
    """
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    # Mark every argument as required so a missing one fails with a clear
    # argparse message instead of a TypeError when the S3 path is built.
    parser.add_argument("--s3_input_bucket", type=str, required=True, help="s3 input bucket")
    parser.add_argument("--s3_input_key_prefix", type=str, required=True, help="s3 input key prefix")
    parser.add_argument("--s3_output_bucket", type=str, required=True, help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix", type=str, required=True, help="s3 output key prefix")
    args = parser.parse_args()

    spark = SparkSession.builder.appName("CSVSum").getOrCreate()

    customSchema = StructType([
        StructField("income", IntegerType(), True)
    ])

    # The input files start with an `income` header line; declare it so the
    # reader skips it explicitly instead of treating it as a data row.
    # `_metadata.file_name` exposes the source file of each row for grouping.
    df = (
        spark.read.format("csv")
        .option("header", "true")
        .schema(customSchema)
        .load(_s3_uri(args.s3_input_bucket, args.s3_input_key_prefix))
        .select("*", "_metadata.file_name")
    )

    sum_df = df.groupBy('file_name').agg(_sum('income').alias('income_sum'))
    processed_rdd = sum_df.rdd
    # coalesce(1) collapses the result into a single output part.
    processed_rdd.coalesce(1).saveAsTextFile(
        _s3_uri(args.s3_output_bucket, args.s3_output_key_prefix, "processed")
    )

# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

Writing ./code/preprocess.py


## 4. Run the Processing with Amazon SageMaker

In [None]:
# Run ./code/preprocess.py as a SageMaker PySpark processing job.
# NOTE(review): timestamp_prefix is refreshed here, but nothing in this cell reads it;
# input_prefix and input_preprocessed_prefix keep the values computed earlier.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())


spark_processor = PySparkProcessor(
    base_job_name="sm-spark",
    framework_version="3.3",
    role=role,
    # Use the notebook-level constant instead of a second hard-coded 2.
    instance_count=num_instances,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=1200,
)

# Spark defaults applied to the job: executor memory and cores per executor.
configuration = [
    {
        "Classification": "spark-defaults",
        "Properties": {"spark.executor.memory": "2g", "spark.executor.cores": "1"},
    }
]

# The arguments are forwarded to preprocess.py's argparse options: the script reads
# from bucket/input_prefix and writes under bucket/input_preprocessed_prefix.
spark_processor.run(
    submit_app="./code/preprocess.py",
    arguments=[
        "--s3_input_bucket",
        bucket,
        "--s3_input_key_prefix",
        input_prefix,
        "--s3_output_bucket",
        bucket,
        "--s3_output_key_prefix",
        input_preprocessed_prefix,
    ],
    configuration=configuration,
    logs=False
)
    