In [2]:
!pip install -U "sagemaker>2.0"
!pip install --upgrade sagemaker

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [4]:
import logging
import sagemaker
from time import gmtime, strftime
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
import io
import os
import sys
import boto3
import pickle
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from numpy.lib.stride_tricks import sliding_window_view

sagemaker_logger = logging.getLogger("sagemaker")
sagemaker_logger.setLevel(logging.INFO)
sagemaker_logger.addHandler(logging.StreamHandler())

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

In [5]:
df = pd.read_csv("s3://training-data-lstm/processed_training_data.csv/")

In [6]:
df = df.drop(columns =  "Unnamed: 0")

In [7]:
df.head(2)

Unnamed: 0,keywordId,date,clicks,impressions,orders,budget,campaign_sales_perc,account_sales_perc,campaign_spend_perc,account_spend_perc,...,targeting_type,budget_type,adFormat,tactic,costType,cpc,year,month,day,dayoftheweek
0,45328926266934,2023-01-13,0,37,0,500.0,0.0,0.0,0.0,0.0,...,2,0,2,0,0,0.0,2023,1,13,4
1,72423678058542,2023-01-13,4,131,0,500.0,0.0,0.0,0.023576,0.013418,...,2,0,2,0,0,0.036826,2023,1,13,4


In [8]:
%%writefile preprocess.py

HISTORICAL_DATA_WINDOW = 14
FUTURE_PREDICTION_WINDOW = 3

train_frames_x_dir = "./train_frames_x"
test_frames_x_dir = "./test_frames_x"
train_frames_y_dir = "./train_frames_y"
test_frames_y_dir = "./test_frames_y"
train_frames_embed_dir = "./train_frames_embed"
test_frames_embed_dir = "./test_frames_embed"
train_frames_decoder_input_dir = "./train_frames_decoder_input"
test_frames_decoder_input_dir = "./test_frames_decoder_input"


def create_training_frames(key, iterator):
    partition = pd.concat(iterator)
    partition.drop(columns=["Unnamed: 0"], inplace=True)
    partition.sort_values(by=["date"], inplace=True)
    split_index = partition.iloc[0]["keywordId"]
    
    
    train_frames_X, train_frames_Y = [], []
    test_frames_X, test_frames_Y = [], []
    train_frames_embed, test_frames_embed = [], []
    train_frames_decoder_input, test_frames_decoder_input = [], []

    train_size = (int)(len(partition) * 0.7)
    train_data = partition[:train_size]
    test_data = partition[train_size - HISTORICAL_DATA_WINDOW - FUTURE_PREDICTION_WINDOW + 1:]
    data_columns = partition.columns
    
    if train_data_arr.shape[0] < HISTORICAL_DATA_WINDOW or test_data_arr.shape[0] < HISTORICAL_DATA_WINDOW:
        return []
    
    train_data_frame_X = sliding_window_view(train_data_arr, window_shape = (HISTORICAL_DATA_WINDOW, train_data_arr.shape[1]))
    test_data_frame_X = sliding_window_view(test_data_arr, window_shape = (HISTORICAL_DATA_WINDOW, test_data_arr.shape[1]))
    train_data_frame_X = np.squeeze(train_data_frame_X)[:-FUTURE_PREDICTION_WINDOW]
    test_data_frame_X = np.squeeze(test_data_frame_X)[:-FUTURE_PREDICTION_WINDOW]
    train_data_frames_X, test_data_frames_X = train_data_frame_X.copy(), test_data_frame_X.copy()
    for frame in train_data_frames_X:
        train_frames_X.append(pd.DataFrame(frame, columns=data_columns))
    for frame in test_data_frames_X:
        test_frames_X.append(pd.DataFrame(frame, columns=data_columns))

    train_data_frame_Y = sliding_window_view(train_data_arr, window_shape = (FUTURE_PREDICTION_WINDOW, train_data_arr.shape[1]))
    test_data_frame_Y = sliding_window_view(test_data_arr, window_shape = (FUTURE_PREDICTION_WINDOW, test_data_arr.shape[1]))
    train_data_frame_Y = np.squeeze(train_data_frame_Y)[HISTORICAL_DATA_WINDOW:]
    test_data_frame_Y = np.squeeze(test_data_frame_Y)[HISTORICAL_DATA_WINDOW:]
    train_data_frames_Y, test_data_frames_Y = train_data_frame_Y.copy(), test_data_frame_Y.copy()
    for frame in train_data_frames_Y:
        train_frames_Y.append(pd.DataFrame(frame, columns=data_columns))
    for frame in test_data_frames_Y:
        test_frames_Y.append(pd.DataFrame(frame, columns=data_columns))   
    
    
    embedding_columns = ["keyword_length", "keyword_num_words", "budget", "matchType", "country_code", "campaign_type",
                         "targeting_type", "budget_type", "adFormat", "tactic", "costType"]

    for i in range(len(train_frames_X)):
        train_frames_embed.append(train_frames_X[i].loc[0][embedding_columns])
        train_frames_X[i].drop(columns=embedding_columns, inplace=True)

    for i in range(len(test_frames_X)):
        test_frames_embed.append(test_frames_X[i].loc[0][embedding_columns])
        test_frames_X[i].drop(columns=embedding_columns, inplace=True)

    for i in range(len(train_frames_X)):
        train_frames_X[i].drop(columns=["keywordId", "date"], inplace=True)
        train_frames_Y[i].drop(columns=embedding_columns, inplace=True)
        train_frames_Y[i].drop(
            columns=["keywordId", "date", "year", "month", "day", "dayoftheweek", "clicks", "impressions", "orders",
                     "campaign_sales_perc", "campaign_spend_perc", "account_sales_perc", "account_spend_perc"],
            inplace=True)

    for i in range(len(test_frames_X)):
        test_frames_X[i].drop(columns=["keywordId", "date"], inplace=True)
        test_frames_Y[i].drop(columns=embedding_columns, inplace=True)
        test_frames_Y[i].drop(
            columns=["keywordId", "date", "year", "month", "day", "dayoftheweek", "clicks", "impressions", "orders",
                     "campaign_sales_perc", "campaign_spend_perc", "account_sales_perc", "account_spend_perc"],
            inplace=True)

    for i in range(len(train_frames_Y)):
        train_frames_decoder_input.append(train_frames_Y[i]["cpc"])
        train_frames_Y[i].drop(columns=["cpc"], inplace=True)

    for i in range(len(test_frames_Y)):
        test_frames_decoder_input.append(test_frames_Y[i]["cpc"])
        test_frames_Y[i].drop(columns=["cpc"], inplace=True)

    train_frames_X, train_frames_Y = np.array(train_frames_X), np.array(train_frames_Y)
    test_frames_X, test_frames_Y = np.array(test_frames_X), np.array(test_frames_Y)
    train_frames_embed, test_frames_embed = np.array(train_frames_embed), np.array(test_frames_embed)
    train_frames_decoder_input, test_frames_decoder_input = np.array(train_frames_decoder_input), np.array(
        test_frames_decoder_input)

    if not os.path.exists(train_frames_x_dir):
        os.makedirs(train_frames_x_dir)
    if not os.path.exists(test_frames_x_dir):
        os.makedirs(test_frames_x_dir)
    if not os.path.exists(train_frames_y_dir):
        os.makedirs(train_frames_y_dir)
    if not os.path.exists(test_frames_y_dir):
        os.makedirs(test_frames_y_dir)
    if not os.path.exists(train_frames_embed_dir):
        os.makedirs(train_frames_embed_dir)
    if not os.path.exists(test_frames_embed_dir):
        os.makedirs(test_frames_embed_dir)
    if not os.path.exists(train_frames_decoder_input_dir):
        os.makedirs(train_frames_decoder_input_dir)
    if not os.path.exists(test_frames_decoder_input_dir):
        os.makedirs(test_frames_decoder_input_dir)

    np.save(os.path.join(train_frames_x_dir, f"par_{split_index}.npy"), train_frames_X)
    np.save(os.path.join(test_frames_x_dir, f"par_{split_index}.npy"), test_frames_X)
    np.save(os.path.join(train_frames_y_dir, f"par_{split_index}.npy"), train_frames_Y)
    np.save(os.path.join(test_frames_y_dir, f"par_{split_index}.npy"), test_frames_Y)
    np.save(os.path.join(train_frames_embed_dir, f"par_{split_index}.npy"), train_frames_X)
    np.save(os.path.join(test_frames_embed_dir, f"par_{split_index}.npy"), test_frames_X)
    np.save(os.path.join(train_frames_decoder_input_dir, f"par_{split_index}.npy"), train_frames_Y)
    np.save(os.path.join(test_frames_decoder_input_dir, f"par_{split_index}.npy"), test_frames_Y)

    return 0



def main():
    
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    parser.add_argument("--s3_input_bucket", type=str, help="s3 input bucket")
    parser.add_argument("--s3_output_bucket", type=str, help="s3 output bucket")
    args,_ = parser.parse_known_args()
    
    save_bucket_name = args.s3_output_bucket
    spark = SparkSession.builder.appName("PySparkApp").getOrCreate()
    
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=args.s3_input_bucket, Key='processed_training_data.csv')
    dataset = pd.read_csv(obj['Body'], nrows=10)
    data_columns = dataset.columns
    spark_df = spark.createDataFrame(dataset)
    
    # Convert Spark DataFrame to Pandas DataFrame
    pandas_df = spark_df.toPandas()

    # Convert Pandas DataFrame back to Spark DataFrame and repartition
    spark_df = spark.createDataFrame(pandas_df)
    spark_df = spark_df.repartition(spark_df.rdd.getNumPartitions())

    # Apply the function to each partition using Spark
    spark_df.rdd.mapPartitionsWithIndex(create_training_frames).collect()
    
    # Apply the function to each partition using Spark
    spark.sparkContext.parallelize(partitions).mapPartitionsWithIndex(create_training_frames).collect()
    
    # Upload the resulting files to S3
    s3_client = boto3.client('s3')
    
    def upload_pickle(data, bucket, key):
        my_array_data = io.BytesIO()
        pickle.dump(data, my_array_data)
        my_array_data.seek(0)
        s3_client.upload_fileobj(my_array_data, bucket, key)
        
    files = sorted(glob.glob(train_frames_x_dir + '/*.npy'))
    train_frames_X = np.concatenate([np.load(f) for f in files], axis=0)
    upload_pickle(train_frames_X, save_bucket_name, "train_frames_x.pkl")

    files = sorted(glob.glob(train_frames_y_dir + '/*.npy'))
    train_frames_Y = np.concatenate([np.load(f) for f in files], axis=0)
    upload_pickle(train_frames_Y, save_bucket_name, "train_frames_y.pkl")

    files = sorted(glob.glob(train_frames_embed_dir + '/*.npy'))
    train_frames_embed = np.concatenate([np.load(f) for f in files], axis=0)
    upload_pickle(train_frames_embed, save_bucket_name, "train_frames_embed.pkl")

    files = sorted(glob.glob(train_frames_decoder_input_dir + '/*.npy'))
    train_frames_decoder_input = np.concatenate([np.load(f) for f in files], axis=0)
    upload_pickle(train_frames_decoder_input, save_bucket_name, "train_frames_decoder_input.pkl")

    files = sorted(glob.glob(test_frames_x_dir + '/*.npy'))
    test_frames_X = np.concatenate([np.load(f) for f in files], axis=0)
    upload_pickle(test_frames_X, save_bucket_name, "test_frames_x.pkl")

    files = sorted(glob.glob(test_frames_y_dir + '/*.npy'))
    test_frames_Y = np.concatenate([np.load(f) for f in files], axis=0)
    upload_pickle(test_frames_Y, save_bucket_name, "test_frames_y.pkl")

    files = sorted(glob.glob(test_frames_embed_dir + '/*.npy'))
    test_frames_embed = np.concatenate([np.load(f) for f in files], axis=0)
    upload_pickle(test_frames_embed, save_bucket_name, "test_frames_embed.pkl")

    files = sorted(glob.glob(test_frames_decoder_input_dir + '/*.npy'))
    test_frames_decoder_input = np.concatenate([np.load(f) for f in files], axis=0)
    upload_pickle(test_frames_decoder_input, save_bucket_name, "test_frames_decoder_input.pkl")

    return

if __name__ == "__main__":
    main()

Overwriting preprocess.py


In [9]:
from sagemaker.processing import FrameworkProcessor
#sklearn_processor = SKLearnProcessor(
#    framework_version="0.20.0", 
#    role=role, 
#    instance_type="ml.t3.medium", 
#    instance_count=2,
#    max_runtime_in_seconds=1200,
#    sagemaker_session = sagemaker_session,
#)
est_cls = sagemaker.sklearn.estimator.SKLearn
framework_version_str = "0.20.0"

script_processor = FrameworkProcessor(
    role=role,
    instance_count=2,
    instance_type="ml.t3.medium",
    estimator_cls=est_cls,
    framework_version=framework_version_str,
)

In [None]:
sklearn_test = "./sklearn_test"
if not os.path.exists(sklearn_test):
    os.makedirs(sklearn_test)

In [None]:
sklearn_processor.run(
    code="preprocess.py",
    #inputs=[ProcessingInput(source=df, destination="/opt/ml/processing/sklearn_test")],
)

In [11]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

script_processor.run(
    code="preprocess.py",
    #source_dir="code",
    #inputs=[ProcessingInput(source=input_data, destination="/opt/ml/processing/input")],
    #outputs=[
    #    ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
    #    ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"),
    #],
    #arguments=["--train-test-split-ratio", "0.2"],
)
script_processor_job_description = script_processor.jobs[-1].describe()
print(script_processor_job_description)


Uploaded None to s3://sagemaker-eu-west-1-321097665711/sklearn-2023-07-20-04-15-05-452/source/sourcedir.tar.gz
Uploaded None to s3://sagemaker-eu-west-1-321097665711/sklearn-2023-07-20-04-15-05-452/source/sourcedir.tar.gz
runproc.sh uploaded to s3://sagemaker-eu-west-1-321097665711/sklearn-2023-07-20-04-15-05-452/source/runproc.sh
runproc.sh uploaded to s3://sagemaker-eu-west-1-321097665711/sklearn-2023-07-20-04-15-05-452/source/runproc.sh
Creating processing-job with name sklearn-2023-07-20-04-15-05-452
Creating processing-job with name sklearn-2023-07-20-04-15-05-452
INFO:sagemaker:Creating processing-job with name sklearn-2023-07-20-04-15-05-452


Using provided s3_resource
........................................................................
[34mTraceback (most recent call last):
  File "preprocess.py", line 200, in <module>
    main()
  File "preprocess.py", line 129, in main
    parser = argparse.ArgumentParser(description="app inputs and outputs")[0m
[34mNameError: name 'argparse' is not defined[0m


UnexpectedStatusException: Error for Processing job sklearn-2023-07-20-04-15-05-452: Failed. Reason: AlgorithmError: See job logs for more information