In [1]:
import os
import sys
import boto3

import numpy as np
import pandas as pd

In [36]:
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='training-data-lstm', Key = 'processed_training_data.csv')
dataset = pd.read_csv(obj['Body'], nrows=100000)

In [37]:
keyword_data_desc_order = dataset["keywordId"].value_counts(ascending=False).index

In [38]:
list_1 = [keyword_data_desc_order[i] for i in range(len(keyword_data_desc_order)) if i%16==0 or i%16==15]
list_2 = [keyword_data_desc_order[i] for i in range(len(keyword_data_desc_order)) if i%16==1 or i%16==14]
list_3 = [keyword_data_desc_order[i] for i in range(len(keyword_data_desc_order)) if i%16==2 or i%16==13]
list_4 = [keyword_data_desc_order[i] for i in range(len(keyword_data_desc_order)) if i%16==3 or i%16==12]
list_5 = [keyword_data_desc_order[i] for i in range(len(keyword_data_desc_order)) if i%16==4 or i%16==11]
list_6 = [keyword_data_desc_order[i] for i in range(len(keyword_data_desc_order)) if i%16==5 or i%16==10]
list_7 = [keyword_data_desc_order[i] for i in range(len(keyword_data_desc_order)) if i%16==6 or i%16==9]
list_8 = [keyword_data_desc_order[i] for i in range(len(keyword_data_desc_order)) if i%16==7 or i%16==8]

In [39]:
shard_1 = dataset.loc[dataset["keywordId"].isin(list_1)]
shard_2 = dataset.loc[dataset["keywordId"].isin(list_2)]
shard_3 = dataset.loc[dataset["keywordId"].isin(list_3)]
shard_4 = dataset.loc[dataset["keywordId"].isin(list_4)]
shard_5 = dataset.loc[dataset["keywordId"].isin(list_5)]
shard_6 = dataset.loc[dataset["keywordId"].isin(list_6)]
shard_7 = dataset.loc[dataset["keywordId"].isin(list_7)]
shard_8 = dataset.loc[dataset["keywordId"].isin(list_8)]

In [40]:
shard_1.to_csv("shard_1_small.csv", index=False)
shard_2.to_csv("shard_2_small.csv", index=False)
shard_3.to_csv("shard_3_small.csv", index=False)
shard_4.to_csv("shard_4_small.csv", index=False)
shard_5.to_csv("shard_5_small.csv", index=False)
shard_6.to_csv("shard_6_small.csv", index=False)
shard_7.to_csv("shard_7_small.csv", index=False)
shard_8.to_csv("shard_8_small.csv", index=False)

In [41]:
s3 = boto3.resource('s3')
s3.meta.client.upload_file("shard_1_small.csv", 'training-data-lstm', 'sharded_data_small/shard_1.csv')

In [42]:
s3.meta.client.upload_file("shard_2_small.csv", 'training-data-lstm', 'sharded_data_small/shard_2.csv')
s3.meta.client.upload_file("shard_3_small.csv", 'training-data-lstm', 'sharded_data_small/shard_3.csv')
s3.meta.client.upload_file("shard_4_small.csv", 'training-data-lstm', 'sharded_data_small/shard_4.csv')
s3.meta.client.upload_file("shard_5_small.csv", 'training-data-lstm', 'sharded_data_small/shard_5.csv')
s3.meta.client.upload_file("shard_6_small.csv", 'training-data-lstm', 'sharded_data_small/shard_6.csv')
s3.meta.client.upload_file("shard_7_small.csv", 'training-data-lstm', 'sharded_data_small/shard_7.csv')
s3.meta.client.upload_file("shard_8_small.csv", 'training-data-lstm', 'sharded_data_small/shard_8.csv')

In [79]:
%%writefile preprocess.py 

import os
import sys
import boto3
import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view

from pathlib import Path

HISTORICAL_DATA_WINDOW = 14
FUTURE_PREDICTION_WINDOW = 3

save_bucket_name = ""

def create_training_frames(partition, keywordId):

  partition.sort_values(by=["date"], inplace=True)
  split_index = keywordId
    
  train_frames_X, train_frames_Y = [], []
  test_frames_X, test_frames_Y = [], []
  train_frames_embed, test_frames_embed = [], []
  train_frames_decoder_input, test_frames_decoder_input = [], []

  train_size = (int)(len(partition) * 0.7)
  train_data = partition[:train_size]
  test_data = partition[train_size-HISTORICAL_DATA_WINDOW-FUTURE_PREDICTION_WINDOW+1:]
  data_columns = partition.columns

  train_data_arr, test_data_arr = np.array(train_data), np.array(test_data)
  if train_data_arr.shape[0] < HISTORICAL_DATA_WINDOW or test_data_arr.shape[0] < HISTORICAL_DATA_WINDOW:
    return 0
    
  train_data_frame_X = sliding_window_view(train_data_arr, window_shape = (HISTORICAL_DATA_WINDOW, train_data_arr.shape[1]))
  test_data_frame_X = sliding_window_view(test_data_arr, window_shape = (HISTORICAL_DATA_WINDOW, test_data_arr.shape[1]))
  train_data_frame_X = np.squeeze(train_data_frame_X)[:-FUTURE_PREDICTION_WINDOW]
  test_data_frame_X = np.squeeze(test_data_frame_X)[:-FUTURE_PREDICTION_WINDOW]
  train_data_frames_X, test_data_frames_X = train_data_frame_X.copy(), test_data_frame_X.copy()
  for frame in train_data_frames_X:
    train_frames_X.append(pd.DataFrame(frame, columns=data_columns))
  for frame in test_data_frames_X:
    test_frames_X.append(pd.DataFrame(frame, columns=data_columns))

  train_data_frame_Y = sliding_window_view(train_data_arr, window_shape = (FUTURE_PREDICTION_WINDOW, train_data_arr.shape[1]))
  test_data_frame_Y = sliding_window_view(test_data_arr, window_shape = (FUTURE_PREDICTION_WINDOW, test_data_arr.shape[1]))
  train_data_frame_Y = np.squeeze(train_data_frame_Y)[HISTORICAL_DATA_WINDOW:]
  test_data_frame_Y = np.squeeze(test_data_frame_Y)[HISTORICAL_DATA_WINDOW:]
  train_data_frames_Y, test_data_frames_Y = train_data_frame_Y.copy(), test_data_frame_Y.copy()
  for frame in train_data_frames_Y:
    train_frames_Y.append(pd.DataFrame(frame, columns=data_columns))
  for frame in test_data_frames_Y:
    test_frames_Y.append(pd.DataFrame(frame, columns=data_columns))
    
  embedding_columns = ["keyword_length", "keyword_num_words", "budget", "matchType", "country_code", "campaign_type", "targeting_type", "budget_type", "adFormat", "tactic", "costType"]
  
  for i in range(len(train_frames_X)):
    train_frames_embed.append(train_frames_X[i].loc[0][embedding_columns])
    train_frames_X[i].drop(columns=embedding_columns, inplace=True)

  for i in range(len(test_frames_X)):
    test_frames_embed.append(test_frames_X[i].loc[0][embedding_columns])
    test_frames_X[i].drop(columns=embedding_columns, inplace=True)
    
  for i in range(len(train_frames_X)):
    train_frames_X[i].drop(columns=["keywordId", "date"], inplace=True)
    train_frames_Y[i].drop(columns=embedding_columns, inplace=True)
    train_frames_Y[i].drop(columns=["keywordId", "date", "year", "month", "day", "dayoftheweek", "clicks", "impressions", "orders", "campaign_sales_perc", "campaign_spend_perc", "account_sales_perc", "account_spend_perc"], inplace=True)

  for i in range(len(test_frames_X)):
    test_frames_X[i].drop(columns=["keywordId", "date"], inplace=True)
    test_frames_Y[i].drop(columns=embedding_columns, inplace=True)
    test_frames_Y[i].drop(columns=["keywordId", "date", "year", "month", "day", "dayoftheweek", "clicks", "impressions", "orders", "campaign_sales_perc", "campaign_spend_perc", "account_sales_perc", "account_spend_perc"], inplace=True)
  
  for i in range(len(train_frames_Y)):
      train_frames_decoder_input.append(train_frames_Y[i]["cpc"])
      train_frames_Y[i].drop(columns=["cpc"], inplace=True)

  for i in range(len(test_frames_Y)):
      test_frames_decoder_input.append(test_frames_Y[i]["cpc"])
      test_frames_Y[i].drop(columns=["cpc"], inplace=True)
    
  train_frames_X, train_frames_Y = np.array(train_frames_X), np.array(train_frames_Y)
  test_frames_X, test_frames_Y = np.array(test_frames_X), np.array(test_frames_Y)
  train_frames_embed, test_frames_embed = np.array(train_frames_embed), np.array(test_frames_embed)
  train_frames_decoder_input, test_frames_decoder_input = np.array(train_frames_decoder_input), np.array(test_frames_decoder_input)
  
  train_frames_x_dir = "./train_frames_x"
  test_frames_x_dir = "./test_frames_x"
  train_frames_y_dir = "./train_frames_y"
  test_frames_y_dir = "./test_frames_y"
  train_frames_embed_dir = "./train_frames_embed"
  test_frames_embed_dir = "./test_frames_embed"
  train_frames_decoder_input_dir = "./train_frames_decoder_input"
  test_frames_decoder_input_dir = "./test_frames_decoder_input"
  
  if not os.path.exists(train_frames_x_dir):
    os.makedirs(train_frames_x_dir)
  if not os.path.exists(test_frames_x_dir):
    os.makedirs(test_frames_x_dir)
  if not os.path.exists(train_frames_y_dir):
    os.makedirs(train_frames_y_dir)
  if not os.path.exists(test_frames_y_dir):
    os.makedirs(test_frames_y_dir)
  if not os.path.exists(train_frames_embed_dir):
    os.makedirs(train_frames_embed_dir)
  if not os.path.exists(test_frames_embed_dir):
    os.makedirs(test_frames_embed_dir)
  if not os.path.exists(train_frames_decoder_input_dir):
    os.makedirs(train_frames_decoder_input_dir)
  if not os.path.exists(test_frames_decoder_input_dir):
    os.makedirs(test_frames_decoder_input_dir)
    
  np.save(os.path.join(train_frames_x_dir, f"par_{split_index}.npy"), train_frames_X)
  np.save(os.path.join(test_frames_x_dir, f"par_{split_index}.npy"), test_frames_X)
  np.save(os.path.join(train_frames_y_dir, f"par_{split_index}.npy"), train_frames_Y)
  np.save(os.path.join(test_frames_y_dir, f"par_{split_index}.npy"), test_frames_Y)
  np.save(os.path.join(train_frames_embed_dir, f"par_{split_index}.npy"), train_frames_X)
  np.save(os.path.join(test_frames_embed_dir, f"par_{split_index}.npy"), test_frames_X)
  np.save(os.path.join(train_frames_decoder_input_dir, f"par_{split_index}.npy"), train_frames_Y)
  np.save(os.path.join(test_frames_decoder_input_dir, f"par_{split_index}.npy"), test_frames_Y)

  return 0

if __name__=="__main__":
    for file in Path("/opt/ml/processing/input/").rglob('*.csv'):
      file_path = str(file)
      dataset = pd.read_csv(file_path)
      for keyword in dataset["keywordId"].unique():
        val = create_training_frames(dataset.loc[dataset["keywordId"]==keyword], keyword)

Overwriting preprocess.py


In [68]:
import logging
import sagemaker
from time import gmtime, strftime

sagemaker_logger = logging.getLogger("sagemaker")
sagemaker_logger.setLevel(logging.INFO)
sagemaker_logger.addHandler(logging.StreamHandler())

sagemaker_session = sagemaker.Session(boto3.session.Session(region_name='eu-north-1'))
role = sagemaker.get_execution_role()

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [64]:
sagemaker_session.boto_region_name

'eu-north-1'

In [76]:
!sudo chmod 777 lost+found

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor
import timeit

est_cls = sagemaker.sklearn.estimator.SKLearn
framework_version_str = "0.20.0"

script_processor = FrameworkProcessor(
    role=role,
    instance_type='ml.t3.medium',
    instance_count=4,
    base_job_name = 'preprocessing',
    sagemaker_session = sagemaker_session,
    estimator_cls=est_cls,
    framework_version=framework_version_str,
)


start = timeit.default_timer()
source_folder = "s3://training-data-lstm/sharded_data_small/" 

# sklearn_processor = SKLearnProcessor(framework_version='0.23-1',
#                                      role=role,
#                                      instance_type='ml.t3.medium',
#                                      instance_count=4,
#                                      base_job_name = 'preprocessing',
#                                      sagemaker_session = sagemaker_session
#                                     )

script_processor.run(
    code='preprocess.py',
    source_dir = "/home/ec2-user/SageMaker/",
    inputs=[
        ProcessingInput(
            source=source_folder,
            s3_data_distribution_type='ShardedByS3Key',
            destination='/opt/ml/processing/input')
    ],
    outputs=[
        ProcessingOutput(
          source='/opt/ml/processing/output/', 
          destination='s3://training-data-lstm/processed-sharded-data/'
        )
    ],
)

stop = timeit.default_timer()

print('Time: ', stop - start) 

Using provided s3_resource


Uploaded /home/ec2-user/SageMaker/ to s3://sagemaker-eu-north-1-321097665711/preprocessing-2023-07-11-14-52-48-747/source/sourcedir.tar.gz
Uploaded /home/ec2-user/SageMaker/ to s3://sagemaker-eu-north-1-321097665711/preprocessing-2023-07-11-14-52-48-747/source/sourcedir.tar.gz
Uploaded /home/ec2-user/SageMaker/ to s3://sagemaker-eu-north-1-321097665711/preprocessing-2023-07-11-14-52-48-747/source/sourcedir.tar.gz
Uploaded /home/ec2-user/SageMaker/ to s3://sagemaker-eu-north-1-321097665711/preprocessing-2023-07-11-14-52-48-747/source/sourcedir.tar.gz
Uploaded /home/ec2-user/SageMaker/ to s3://sagemaker-eu-north-1-321097665711/preprocessing-2023-07-11-14-52-48-747/source/sourcedir.tar.gz
Uploaded /home/ec2-user/SageMaker/ to s3://sagemaker-eu-north-1-321097665711/preprocessing-2023-07-11-14-52-48-747/source/sourcedir.tar.gz
Uploaded /home/ec2-user/SageMaker/ to s3://sagemaker-eu-north-1-321097665711/preprocessing-2023-07-11-14-52-48-747/source/sourcedir.tar.gz
INFO:sagemaker.processing:U

.............................................................................[34mFound existing installation: typing 3.7.4.3[0m
[34mUninstalling typing-3.7.4.3:
  Successfully uninstalled typing-3.7.4.3[0m
[34mCollecting numpy==1.20
  Downloading numpy-1.20.0-cp37-cp37m-manylinux2010_x86_64.whl (15.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.3/15.3 MB 17.7 MB/s eta 0:00:00[0m
[34mInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.5
    Uninstalling numpy-1.19.5:[0m
[34m      Successfully uninstalled numpy-1.19.5[0m
[34mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.[0m
[34msagemaker-containers 2.8.6.post2 requires typing, which is not installed.[0m
[34msagemaker-sklearn-container 1.0 requires numpy==1.19.5, but you have numpy 1.20.0 which is incompatible.[0m
[34mSuccessfully

KeyboardInterrupt: 

In [74]:
import os
os.getcwd()

'/home/ec2-user/SageMaker'

In [52]:
!export AWS_DEFAULT_REGION=eu-north-1

In [60]:
role.region

AttributeError: 'str' object has no attribute 'region'