In [35]:
import boto3
import logging
import sagemaker

from sagemaker.tensorflow import TensorFlowProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role

In [36]:
sagemaker_logger = logging.getLogger("sagemaker")
sagemaker_logger.setLevel(logging.INFO)
sagemaker_logger.addHandler(logging.StreamHandler())

sagemaker_session = sagemaker.Session(boto3.session.Session(region_name='eu-north-1'))
role = sagemaker.get_execution_role()
print(role)

arn:aws:iam::321097665711:role/service-role/AmazonSageMaker-ExecutionRole-20230629T130572


In [102]:
%%writefile training.py

import os
import glob
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.optimizers import Adam

tf.random.set_seed(1)
np.random.seed(1)

import warnings
warnings.filterwarnings("ignore")

class DataGenerator(keras.utils.Sequence):
    def __init__(self, file_path, batch_size=32, shuffle=True):
        from s3fs.core import S3FileSystem
        self.s3 = S3FileSystem()
        self.file_path = file_path
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.file_types = ["train_frames_x", "train_frames_y", "train_frames_embed", "train_frames_decoder_input"]
        self.indices = []
        for i in range(1, 2):
            for j in range(1, 2):
                self.indices += [f"_{i}_{j}"]
        self.on_epoch_end()
        self.total_samples = 0
        self.num_samples_per_index = {}
        for index in self.indices:
            data = np.load(self.s3.open(os.path.join(self.file_path, self.file_types[3] + index + ".pkl")), allow_pickle=True)
            self.total_samples += data.shape[0]
            self.num_samples_per_index[index] = data.shape[0]

    def __len__(self):
        return int(np.floor(self.total_samples/self.batch_size))

    def __getitem__(self, index):

        # Generate indexes of the batch
        lower_idx = index * self.batch_size
        upper_idx = (index+1) * self.batch_size - 1
        
        start = 0
        end = 0
        begin = 0
        finish = 0
        starting_idx = ""
        ending_idx = ""
        
        for idx in self.indices:
            end = start + self.num_samples_per_index[idx]
            if lower_idx < end and lower_idx >= start:
                starting_idx = idx
                begin = lower_idx-start
            if upper_idx < end and upper_idx >= start:
                ending_idx = idx
                finish = upper_idx-start
                break
            start = end
        
        train_data_x, train_data_embed, train_data_decoder_input, train_data_y = [], [], [], []
        started = False
        for idx in self.indices:
            if not started and idx != starting_idx:
                continue
            if idx == starting_idx:
                if idx == ending_idx:
                    train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_decoder_input += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    break
                else:
                    train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_decoder_input += [self.s3.open(np.load(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
            elif idx != ending_idx:
                train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)]
                train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)]
                train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)]
                train_data_decoder_input += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)]
            elif idx == ending_idx:
                train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)[:finish+1]]
                train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)[:finish+1]]
                train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)[:finish+1]]
                train_data_decoder_input += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)[:finish+1]]
        
        train_data_x, train_data_y, train_data_embed, train_data_decoder_input = np.concatenate(train_data_x), np.concatenate(train_data_y), np.concatenate(train_data_embed), np.concatenate(train_data_decoder_input)
        
        return [train_data_x[:, :, 1:], train_data_embed[:, 1, 1:], train_data_decoder_input[:, :, 2:]], train_data_y[:, :, 1:]

    def on_epoch_end(self):
        np.random.shuffle(self.indices)

def load_data(train_data_path):
    
    train_data = {}
    test_data = {}
    
    file_types = ["train_frames_x_1", "train_frames_y_1", "train_frames_embed_1", "train_frames_decoder_input_1"]
    for file_type in file_types:
        
        data = []
        for file in sorted(glob.glob(os.path.join(train_data_path, file_type+"*"))):
            data.append(np.load(str(file), allow_pickle=True))
        data = np.array(data)
        train_data[file_type] = data
        
        data = []
        for file in sorted(glob.glob(os.path.join("test" + train_data_path[5:], file_type+"*"))):
            data.append(np.load(str(file), allow_pickle=True))
        data = np.array(data)
        test_data[file_type] = data
        
    return train_data, test_data

def define_model(num_embedding_features, num_historical_features, historical_data_window, future_prediction_window):
    
    encoder_input = keras.Input(shape=(historical_data_window, num_historical_features))
    encoder_lstm1 = layers.LSTM(32, return_sequences=True)(encoder_input)
    encoder_lstm1 = layers.ReLU()(encoder_lstm1)
    batch_norm1 = layers.BatchNormalization()(encoder_lstm1)
    encoder_lstm2 = layers.LSTM(64, return_sequences=True)(batch_norm1)
    encoder_lstm2 = layers.ReLU()(encoder_lstm2)
    batch_norm2 = layers.BatchNormalization()(encoder_lstm2)
    encoder_output = layers.LSTM(32)(batch_norm2)
    encoder_output = layers.ReLU()(encoder_output)

    embedding_input = keras.Input(shape=(num_embedding_features, ))
    embedding_layer1 = layers.Dense(32)(embedding_input)
    embedding_layer1 = layers.ReLU()(embedding_layer1)
    batch_norm3 = layers.BatchNormalization()(embedding_layer1)
    embedding_layer2 = layers.Dense(64)(batch_norm3)
    embedding_layer2 = layers.ReLU()(embedding_layer2)
    batch_norm4 = layers.BatchNormalization()(embedding_layer2)
    embedding_output = layers.Dense(32)(batch_norm4)
    embedding_output = layers.ReLU()(embedding_output)

    embedding_encoder_concatenate = layers.Concatenate()([encoder_output, embedding_output])
    embedding_encoder_concatenate = layers.Dense(32)(embedding_encoder_concatenate)
    embedding_encoder_concatenate = layers.ReLU()(embedding_encoder_concatenate)
    future_cpc = keras.Input(shape=(FUTURE_PREDICTION_WINDOW, ))

    decoder_input = layers.RepeatVector(future_prediction_window)(embedding_encoder_concatenate)
    future_cpc = layers.Reshape((-1, 1))(future_cpc)
    decoder_input = layers.Concatenate()([decoder_input, future_cpc])
    decoder_lstm1 = layers.LSTM(32, return_sequences=True)(decoder_input)
    decoder_lstm1 = layers.ReLU()(decoder_lstm1)
    batch_norm5 = layers.BatchNormalization()(decoder_lstm1)
    decoder_lstm2 = layers.LSTM(16, return_sequences=True)(batch_norm5)
    decoder_lstm2 = layers.ReLU()(decoder_lstm2)
    batch_norm6 = layers.BatchNormalization()(decoder_lstm2)
    decoder_output = layers.LSTM(2, return_sequences=True)(batch_norm6)
    decoder_output_fin = layers.ReLU()(decoder_output)

    # Create the model
    ED_lstm_model = tf.keras.Model(inputs=[encoder_input, embedding_input, future_cpc], outputs=decoder_output_fin)
    ED_lstm_model.compile(optimizer="adam", loss='mean_squared_error')
    
    return ED_lstm_model

def prepare_data(train_data, test_data):
    
    for key, value in train_data.items():
        train_data[key] = np.nan_to_num(np.array(value).astype(np.float32)).astype(np.float32)
        
    for key, value in test_data.items():
        test_data[key] = np.nan_to_num(np.array(value).astype(np.float32)).astype(np.float32)
        
    return train_data, test_data

def train_model(epochs, batchsize, model_path, model, train_generator, valid_generator):
    
    history = model.fit_generator(
                         generator = train_generator,
                         validation_data = valid_generator,
                         use_multiprocessing = True
                         workers=2,
                         verbose=1,
                         callbacks=[
                            tf.keras.callbacks.EarlyStopping(
                                monitor='val_loss',
                                min_delta=0,
                                patience=10,
                                verbose=1,
                                mode='min'
                            ),
                            tf.keras.callbacks.ModelCheckpoint(
                                model_path,
                                monitor='val_loss',
                                save_best_only=True,
                                mode='min',
                                verbose=1
                            )
                         ],
                     )
    return history
                         
if __name__=="__main__":
    
    path = "/opt/ml/processing/input/data/"
    model_path = "/opt/ml/processing/output/trained_model.ckpt"
    epochs = 10
    batch_size = 32

    train_data_gen = DataGenerator("s3://training-data-lstm/processed_data_1/", 32, True, "train")
    val_data_gen = DataGenerator("s3://training-data-lstm/processed_data_1/", 32, True, "test")
    
    num_embedding_features = 14
    num_historical_features = 14
    historical_data_window = 14
    future_prediction_window = 3
                         
    model = define_model(num_embedding_features, num_historical_features, historical_data_window, future_prediction_window)
    history = train_model(epochs, batch_size, model_path, model, train_data_gen, val_data_gen)
    
                        

Overwriting training.py


In [9]:
import os
import glob
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.optimizers import Adam

tf.random.set_seed(1)
np.random.seed(1)

import warnings
warnings.filterwarnings("ignore")

class DataGenerator(keras.utils.Sequence):
    def __init__(self, file_path, batch_size=32, shuffle=True, train_or_valid="train"):
        from s3fs.core import S3FileSystem
        self.s3 = S3FileSystem()
        self.file_path = file_path
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.file_types = [f"{train_or_valid}_frames_x", f"{train_or_valid}_frames_y", f"{train_or_valid}_frames_embed", f"{train_or_valid}_frames_decoder_input"]
        self.indices = []
        for i in range(1, 2):
            for j in range(1, 2):
                self.indices += [f"_{i}_{j}"]
        self.on_epoch_end()
        self.total_samples = 0
        self.num_samples_per_index = {}
        for index in self.indices:
            data = np.load(self.s3.open(os.path.join(self.file_path, self.file_types[3] + index + ".pkl")), allow_pickle=True)
            self.total_samples += data.shape[0]
            self.num_samples_per_index[index] = data.shape[0]

    def __len__(self):
        return int(np.floor(self.total_samples/self.batch_size))

    def __getitem__(self, index):

        # Generate indexes of the batch
        lower_idx = index * self.batch_size
        upper_idx = (index+1) * self.batch_size - 1
        
        start = 0
        end = 0
        begin = 0
        finish = 0
        starting_idx = ""
        ending_idx = ""
        
        for idx in self.indices:
            end = start + self.num_samples_per_index[idx]
            if lower_idx < end and lower_idx >= start:
                starting_idx = idx
                begin = lower_idx-start
            if upper_idx < end and upper_idx >= start:
                ending_idx = idx
                finish = upper_idx-start
                break
            start = end
        
        train_data_x, train_data_embed, train_data_decoder_input, train_data_y = [], [], [], []
        started = False
        for idx in self.indices:
            if not started and idx != starting_idx:
                continue
            if idx == starting_idx:
                if idx == ending_idx:
                    train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_decoder_input += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    break
                else:
                    train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_decoder_input += [self.s3.open(np.load(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
            elif idx != ending_idx:
                train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)]
                train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)]
                train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)]
                train_data_decoder_input += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)]
            elif idx == ending_idx:
                train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)[:finish+1]]
                train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)[:finish+1]]
                train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)[:finish+1]]
                train_data_decoder_input += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)[:finish+1]]
        
        train_data_x, train_data_y, train_data_embed, train_data_decoder_input = np.concatenate(train_data_x), np.concatenate(train_data_y), np.concatenate(train_data_embed), np.concatenate(train_data_decoder_input)
        
        return [np.nan_to_num(train_data_x[:, :, 1:]).astype(np.float32), np.nan_to_num(np.squeeze(train_data_embed[:, :, 1:])).astype(np.float32), np.nan_to_num(train_data_decoder_input[:, :, 2:]).astype(np.float32)], np.nan_to_num(train_data_y[:, :, 1:]).astype(np.float32)

    def on_epoch_end(self):
        np.random.shuffle(self.indices)

def load_data(train_data_path):
    
    train_data = {}
    test_data = {}
    
    file_types = ["train_frames_x_1", "train_frames_y_1", "train_frames_embed_1", "train_frames_decoder_input_1"]
    for file_type in file_types:
        
        data = []
        for file in sorted(glob.glob(os.path.join(train_data_path, file_type+"*"))):
            data.append(np.load(str(file), allow_pickle=True))
        data = np.array(data)
        train_data[file_type] = data
        
        data = []
        for file in sorted(glob.glob(os.path.join("test" + train_data_path[5:], file_type+"*"))):
            data.append(np.load(str(file), allow_pickle=True))
        data = np.array(data)
        test_data[file_type] = data
        
    return train_data, test_data

def define_model(num_embedding_features, num_historical_features, historical_data_window, future_prediction_window):
    
    encoder_input = keras.Input(shape=(historical_data_window, num_historical_features))
    encoder_lstm1 = layers.LSTM(32, return_sequences=True)(encoder_input)
    encoder_lstm1 = layers.ReLU()(encoder_lstm1)
    batch_norm1 = layers.BatchNormalization()(encoder_lstm1)
    encoder_lstm2 = layers.LSTM(64, return_sequences=True)(batch_norm1)
    encoder_lstm2 = layers.ReLU()(encoder_lstm2)
    batch_norm2 = layers.BatchNormalization()(encoder_lstm2)
    encoder_output = layers.LSTM(32)(batch_norm2)
    encoder_output = layers.ReLU()(encoder_output)

    embedding_input = keras.Input(shape=(num_embedding_features, ))
    embedding_layer1 = layers.Dense(32)(embedding_input)
    embedding_layer1 = layers.ReLU()(embedding_layer1)
    batch_norm3 = layers.BatchNormalization()(embedding_layer1)
    embedding_layer2 = layers.Dense(64)(batch_norm3)
    embedding_layer2 = layers.ReLU()(embedding_layer2)
    batch_norm4 = layers.BatchNormalization()(embedding_layer2)
    embedding_output = layers.Dense(32)(batch_norm4)
    embedding_output = layers.ReLU()(embedding_output)

    embedding_encoder_concatenate = layers.Concatenate()([encoder_output, embedding_output])
    embedding_encoder_concatenate = layers.Dense(32)(embedding_encoder_concatenate)
    embedding_encoder_concatenate = layers.ReLU()(embedding_encoder_concatenate)
    future_cpc = keras.Input(shape=(future_prediction_window, ))

    decoder_input = layers.RepeatVector(future_prediction_window)(embedding_encoder_concatenate)
    future_cpc = layers.Reshape((-1, 1))(future_cpc)
    decoder_input = layers.Concatenate()([decoder_input, future_cpc])
    decoder_lstm1 = layers.LSTM(32, return_sequences=True)(decoder_input)
    decoder_lstm1 = layers.ReLU()(decoder_lstm1)
    batch_norm5 = layers.BatchNormalization()(decoder_lstm1)
    decoder_lstm2 = layers.LSTM(16, return_sequences=True)(batch_norm5)
    decoder_lstm2 = layers.ReLU()(decoder_lstm2)
    batch_norm6 = layers.BatchNormalization()(decoder_lstm2)
    decoder_output = layers.LSTM(2, return_sequences=True)(batch_norm6)
    decoder_output_fin = layers.ReLU()(decoder_output)

    # Create the model
    ED_lstm_model = tf.keras.Model(inputs=[encoder_input, embedding_input, future_cpc], outputs=decoder_output_fin)
    ED_lstm_model.compile(optimizer="adam", loss='mean_squared_error')
    
    return ED_lstm_model

def prepare_data(train_data, test_data):
    
    for key, value in train_data.items():
        train_data[key] = np.nan_to_num(np.array(value).astype(np.float32)).astype(np.float32)
        
    for key, value in test_data.items():
        test_data[key] = np.nan_to_num(np.array(value).astype(np.float32)).astype(np.float32)
        
    return train_data, test_data

def train_model(epochs, batchsize, model_path, model, train_generator, valid_generator):
    
    history = model.fit_generator(
                         generator = train_generator,
                         validation_data = valid_generator,
                         use_multiprocessing = True,
                         workers=2,
                         verbose=1,
                         callbacks=[
                            tf.keras.callbacks.EarlyStopping(
                                monitor='val_loss',
                                min_delta=0,
                                patience=10,
                                verbose=1,
                                mode='min'
                            ),
                            tf.keras.callbacks.ModelCheckpoint(
                                model_path,
                                monitor='val_loss',
                                save_best_only=True,
                                mode='min',
                                verbose=1
                            )
                         ],
                     )
    return history

In [None]:
model_path = "./trained_model.ckpt"
epochs = 1
batch_size = 32

tf.keras.backend.set_image_data_format("channels_last")

train_data_gen = DataGenerator("s3://training-data-lstm/processed_data_1/", 32, True, "train")
val_data_gen = DataGenerator("s3://training-data-lstm/processed_data_1/", 32, True, "train")

num_embedding_features = 14
num_historical_features = 14
historical_data_window = 14
future_prediction_window = 3

model = define_model(num_embedding_features, num_historical_features, historical_data_window, future_prediction_window)
history = train_model(epochs, batch_size, model_path, model, train_data_gen, val_data_gen)

    5/10409 [..............................] - ETA: 68:05:59 - loss: 6.3599

Process Keras_worker_ForkPoolWorker-2:
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/keras/utils/data_utils.py", line 647, in get_index
    return _SHARED_SEQUENCES[uid][i]
  File "/tmp/ipykernel_11396/3441579348.py", line 72, in __getitem__
    train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/numpy/

In [100]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 14, 14)]     0           []                               
                                                                                                  
 input_9 (InputLayer)           [(None, 14)]         0           []                               
                                                                                                  
 lstm_10 (LSTM)                 (None, 14, 32)       6016        ['input_8[0][0]']                
                                                                                                  
 dense_8 (Dense)                (None, 32)           480         ['input_9[0][0]']                
                                                                                            

In [None]:
class DataGenerator(keras.utils.Sequence):
    def __init__(self, batch_size=32, shuffle=True):
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.file_types = ["train_frames_x", "train_frames_y", "train_frames_embed", "train_frames_decoder_input"]
        self.indices = []
        for i in range(1, 2):
            for j in range(1, 2):
                self.indices += [f"_{i}_{j}"]
        self.on_epoch_end()
        self.total_samples = 0
        self.num_samples_per_index = {}
        for index in self.indices:
            data = np.load(self.file_types[3] + index + ".pkl", allow_pickle=True)
            self.total_samples += data.shape[0]
            self.num_samples_per_index[index] = data.shape[0]

    def __len__(self):
        return int(np.floor(self.total_samples/self.batch_size))

    def __getitem__(self, index):

        # Generate indexes of the batch
        lower_idx = index * self.batch_size
        upper_idx = (index+1) * self.batch_size - 1
        
        start = 0
        end = 0
        begin = 0
        finish = 0
        starting_idx = ""
        ending_idx = ""
        
        for idx in self.indices:
            end = start + self.num_samples_per_index[idx]
            if lower_idx < end and lower_idx >= start:
                starting_idx = idx
                begin = lower_idx-start
            if upper_idx < end and upper_idx >= start:
                ending_idx = idx
                finish = upper_idx-start
                break
            start = end
        
        train_data_x, train_data_embed, train_data_decoder_input, train_data_y = [], [], [], []
        started = False
        for idx in self.indices:
            if not started and idx != starting_idx:
                continue
            if idx == starting_idx:
                if idx == ending_idx:
                    train_data_x += [np.load("train_frames_x"+idx+".pkl", allow_pickle=True)[begin:finish+1]]
                    train_data_y += [np.load("train_frames_y"+idx+".pkl", allow_pickle=True)[begin:finish+1]]
                    train_data_embed += [np.load("train_frames_embed"+idx+".pkl", allow_pickle=True)[begin:finish+1]]
                    train_data_decoder_input += [np.load("train_frames_decoder_input"+idx+".pkl", allow_pickle=True)[begin:finish+1]]
                    break
                else:
                    train_data_x += [np.load("train_frames_x"+idx+".pkl", allow_pickle=True)[begin:finish+1]]
                    train_data_y += [np.load("train_frames_y"+idx+".pkl", allow_pickle=True)[begin:finish+1]]
                    train_data_embed += [np.load("train_frames_embed"+idx+".pkl", allow_pickle=True)[begin:finish+1]]
                    train_data_decoder_input += [np.load("train_frames_decoder_input"+idx+".pkl", allow_pickle=True)[begin:finish+1]]
            elif idx != ending_idx:
                train_data_x += [np.load("train_frames_x"+idx+".pkl", allow_pickle=True)]
                train_data_y += [np.load("train_frames_y"+idx+".pkl", allow_pickle=True)]
                train_data_embed += [np.load("train_frames_embed"+idx+".pkl", allow_pickle=True)]
                train_data_decoder_input += [np.load("train_frames_decoder_input"+idx+".pkl", allow_pickle=True)]
            elif idx == ending_idx:
                train_data_x += [np.load("train_frames_x"+idx+".pkl", allow_pickle=True)[:finish+1]]
                train_data_y += [np.load("train_frames_y"+idx+".pkl", allow_pickle=True)[:finish+1]]
                train_data_embed += [np.load("train_frames_embed"+idx+".pkl", allow_pickle=True)[:finish+1]]
                train_data_decoder_input += [np.load("train_frames_decoder_input"+idx+".pkl", allow_pickle=True)[:finish+1]]
        
        train_data_x, train_data_y, train_data_embed, train_data_decoder_input = pd.concat(train_data_x), pd.concat(train_data_y), pd.concat(train_data_embed), pd.concat(train_data_decoder_input)
        
        return [train_data_x, train_data_embed, train_data_decoder_input], train_data_y

    def on_epoch_end(self):
        np.random.shuffle(self.indices)

In [25]:
tp = TensorFlowProcessor(
    framework_version='2.3',
    role=role,
    instance_type='ml.t3.2xlarge',
    instance_count=1,
    base_job_name='frameworkprocessor-TF',
    py_version='py37',
    sagemaker_session = sagemaker_session
)

#Run the processing job
tp.run(
    code='training.py',
    source_dir='/home/ec2-user/SageMaker',
    inputs=[
        ProcessingInput(
            source='s3://training-data-lstm/processed_data/',
            destination='/opt/ml/processing/input/data'
        )
    ],
    outputs=[
        ProcessingOutput(
            source='/opt/ml/processing/output',
            destination='s3://training-data-lstm/model_artifacts/'
        )
    ]
)

image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


Uploaded /home/ec2-user/SageMaker to s3://sagemaker-eu-north-1-321097665711/frameworkprocessor-TF-2023-07-12-18-47-38-194/source/sourcedir.tar.gz
Uploaded /home/ec2-user/SageMaker to s3://sagemaker-eu-north-1-321097665711/frameworkprocessor-TF-2023-07-12-18-47-38-194/source/sourcedir.tar.gz
Uploaded /home/ec2-user/SageMaker to s3://sagemaker-eu-north-1-321097665711/frameworkprocessor-TF-2023-07-12-18-47-38-194/source/sourcedir.tar.gz
INFO:sagemaker.processing:Uploaded /home/ec2-user/SageMaker to s3://sagemaker-eu-north-1-321097665711/frameworkprocessor-TF-2023-07-12-18-47-38-194/source/sourcedir.tar.gz
runproc.sh uploaded to s3://sagemaker-eu-north-1-321097665711/frameworkprocessor-TF-2023-07-12-18-47-38-194/source/runproc.sh
runproc.sh uploaded to s3://sagemaker-eu-north-1-321097665711/frameworkprocessor-TF-2023-07-12-18-47-38-194/source/runproc.sh
runproc.sh uploaded to s3://sagemaker-eu-north-1-321097665711/frameworkprocessor-TF-2023-07-12-18-47-38-194/source/runproc.sh
INFO:sagemak

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateProcessingJob operation: The account-level service limit 'ml.t3.2xlarge for processing job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.

In [8]:
import os
import numpy as np

import keras

class DataGenerator(keras.utils.Sequence):
    def __init__(self, file_path, batch_size=32, shuffle=True):
        from s3fs.core import S3FileSystem
        self.s3 = S3FileSystem()
        self.file_path = file_path
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.file_types = ["train_frames_x", "train_frames_y", "train_frames_embed", "train_frames_decoder_input"]
        self.indices = []
        for i in range(1, 2):
            for j in range(1, 2):
                self.indices += [f"_{i}_{j}"]
        self.on_epoch_end()
        self.total_samples = 0
        self.num_samples_per_index = {}
        for index in self.indices:
            data = np.load(self.s3.open(os.path.join(self.file_path, self.file_types[3] + index + ".pkl")), allow_pickle=True)
            self.total_samples += data.shape[0]
            self.num_samples_per_index[index] = data.shape[0]

    def __len__(self):
        return int(np.floor(self.total_samples/self.batch_size))

    def __getitem__(self, index):

        # Generate indexes of the batch
        lower_idx = index * self.batch_size
        upper_idx = (index+1) * self.batch_size - 1
        
        start = 0
        end = 0
        begin = 0
        finish = 0
        starting_idx = ""
        ending_idx = ""
        
        for idx in self.indices:
            end = start + self.num_samples_per_index[idx]
            if lower_idx < end and lower_idx >= start:
                starting_idx = idx
                begin = lower_idx-start
            if upper_idx < end and upper_idx >= start:
                ending_idx = idx
                finish = upper_idx-start
                break
            start = end
        
        train_data_x, train_data_embed, train_data_decoder_input, train_data_y = [], [], [], []
        started = False
        for idx in self.indices:
            if not started and idx != starting_idx:
                continue
            if idx == starting_idx:
                if idx == ending_idx:
                    train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_decoder_input += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    break
                else:
                    train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
                    train_data_decoder_input += [self.s3.open(np.load(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)[begin:finish+1]]
            elif idx != ending_idx:
                train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)]
                train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)]
                train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)]
                train_data_decoder_input += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)]
            elif idx == ending_idx:
                train_data_x += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_x"+idx+".pkl")), allow_pickle=True)[:finish+1]]
                train_data_y += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_y"+idx+".pkl")), allow_pickle=True)[:finish+1]]
                train_data_embed += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_embed"+idx+".pkl")), allow_pickle=True)[:finish+1]]
                train_data_decoder_input += [np.load(self.s3.open(os.path.join(self.file_path, "train_frames_decoder_input"+idx+".pkl")), allow_pickle=True)[:finish+1]]
        
        train_data_x, train_data_y, train_data_embed, train_data_decoder_input = np.concatenate(train_data_x), np.concatenate(train_data_y), np.concatenate(train_data_embed), np.concatenate(train_data_decoder_input)
        
        return [train_data_x[:, :, 1:], train_data_embed, train_data_decoder_input], train_data_y[:, :, 1:]

    def on_epoch_end(self):
        np.random.shuffle(self.indices)

In [None]:
import numpy as np

In [31]:
decoder_input = np.load(s3.open(os.path.join("s3://training-data-lstm/processed_data_1/", "train_frames_embed_1_1"+".pkl")), allow_pickle=True)

In [33]:
decoder_input.shape

(333100, 11)

In [10]:
train_data_gen = DataGenerator("s3://training-data-lstm/processed_data_1/", 32)

In [11]:
X, y = train_data_gen.__getitem__(1)

KeyboardInterrupt: 

In [91]:
X[2].shape

(32, 3, 3)

In [85]:
y

array([[[14261286, 0.0, 0.0],
        [14261019, 0.0, 0.0],
        [14260726, 0.0, 0.0]],

       [[15709598, 0.0, 0.0],
        [15708388, 0.0, 0.0],
        [15707137, 0.0, 0.0]],

       [[15708388, 0.0, 0.0],
        [15707137, 0.0, 0.0],
        [15705882, 0.0, 0.0]],

       [[26532881, 0.0, 0.0],
        [26531270, 0.55, 0.0],
        [26529737, 0.0, 0.0]],

       [[26531270, 0.55, 0.0],
        [26529737, 0.0, 0.0],
        [26528007, 0.0, 0.0]],

       [[26529737, 0.0, 0.0],
        [26528007, 0.0, 0.0],
        [26526002, 0.49, 0.0]],

       [[26528007, 0.0, 0.0],
        [26526002, 0.49, 0.0],
        [26523999, 0.26, 0.0]],

       [[26526002, 0.49, 0.0],
        [26523999, 0.26, 0.0],
        [26521961, 0.3, 0.0]],

       [[26523999, 0.26, 0.0],
        [26521961, 0.3, 0.0],
        [26520276, 0.74, 0.0]],

       [[26521961, 0.3, 0.0],
        [26520276, 0.74, 0.0],
        [26518934, 0.0, 0.0]],

       [[18093645, 0.0, 0.0],
        [18093619, 0.0, 0.0],
        [1

In [7]:
from s3fs.core import S3FileSystem
import boto3
import io
import numpy as np
import pickle
s3 = S3FileSystem()
s3_client = boto3.client("s3")
file_types = ["x", "y", "embed", "decoder_input"]
for file_type in file_types:
    for i in range(1, 9):
        for j in range(1, 5):
            print(f"{file_type}_{i}_{j}")
            test_arr = np.load(s3.open(os.path.join("s3://training-data-lstm/processed_data/", f"test_frames_{file_type}_{i}_{j}.pkl")), allow_pickle=True)
            test_arr1 = np.concatenate([sample for sample in test_arr], axis=0)
            my_array_data = io.BytesIO()
            pickle.dump(test_arr1, my_array_data)
            my_array_data.seek(0)
            s3_client.upload_fileobj(my_array_data, 'training-data-lstm', f'processed_data_1/test_frames_{file_type}_{i}_{j}.pkl')

x_1_1
x_1_2
x_1_3
x_1_4
x_2_1
x_2_2
x_2_3
x_2_4
x_3_1
x_3_2
x_3_3
x_3_4
x_4_1
x_4_2
x_4_3
x_4_4
x_5_1
x_5_2
x_5_3
x_5_4
x_6_1
x_6_2
x_6_3
x_6_4
x_7_1
x_7_2
x_7_3
x_7_4
x_8_1
x_8_2
x_8_3
x_8_4
y_1_1
y_1_2
y_1_3
y_1_4
y_2_1
y_2_2
y_2_3
y_2_4
y_3_1
y_3_2
y_3_3
y_3_4
y_4_1
y_4_2
y_4_3
y_4_4
y_5_1
y_5_2
y_5_3
y_5_4
y_6_1
y_6_2
y_6_3
y_6_4
y_7_1
y_7_2
y_7_3
y_7_4
y_8_1
y_8_2
y_8_3
y_8_4
embed_1_1
embed_1_2
embed_1_3
embed_1_4
embed_2_1
embed_2_2
embed_2_3
embed_2_4
embed_3_1
embed_3_2
embed_3_3
embed_3_4
embed_4_1
embed_4_2
embed_4_3
embed_4_4
embed_5_1
embed_5_2
embed_5_3
embed_5_4
embed_6_1
embed_6_2
embed_6_3
embed_6_4
embed_7_1
embed_7_2
embed_7_3
embed_7_4
embed_8_1
embed_8_2
embed_8_3
embed_8_4
decoder_input_1_1
decoder_input_1_2
decoder_input_1_3
decoder_input_1_4
decoder_input_2_1
decoder_input_2_2
decoder_input_2_3
decoder_input_2_4
decoder_input_3_1
decoder_input_3_2
decoder_input_3_3
decoder_input_3_4
decoder_input_4_1
decoder_input_4_2
decoder_input_4_3
decoder_input_4_4
decoder_

In [1]:
from s3fs.core import S3FileSystem
s3 = S3FileSystem()
import numpy as np
import os
test_arr = np.load(s3.open(os.path.join("s3://training-data-lstm/processed_data/", "train_frames_y_1_1"+".pkl")), allow_pickle=True)

In [2]:
test_arr.shape

(9420,)

In [3]:
test_arr1 = np.concatenate([sample for sample in test_arr], axis=0)

In [4]:
test_arr1.shape

(333100, 3, 3)

In [48]:
for sample in X[0]:
    print(sample.shape)

(67, 14, 15)
(2, 14, 15)
(28, 14, 15)
(21, 14, 15)
(3, 14, 15)
(20, 14, 15)
(7, 14, 15)
(3, 14, 15)
(17, 14, 15)
(23, 14, 15)
(28, 14, 15)
(12, 14, 15)
(2, 14, 15)
(24, 14, 15)
(8, 14, 15)
(76, 14, 15)
(8, 14, 15)
(35, 14, 15)
(16, 14, 15)
(9, 14, 15)
(14, 14, 15)
(10, 14, 15)
(6, 14, 15)
(32, 14, 15)
(37, 14, 15)
(86, 14, 15)
(112, 14, 15)
(6, 14, 15)
(4, 14, 15)
(183, 14, 15)
(49, 14, 15)
(130, 14, 15)


In [50]:
for sample in X[2]:
    print(sample.shape)

(67, 3, 3)
(2, 3, 3)
(28, 3, 3)
(21, 3, 3)
(3, 3, 3)
(20, 3, 3)
(7, 3, 3)
(3, 3, 3)
(17, 3, 3)
(23, 3, 3)
(28, 3, 3)
(12, 3, 3)
(2, 3, 3)
(24, 3, 3)
(8, 3, 3)
(76, 3, 3)
(8, 3, 3)
(35, 3, 3)
(16, 3, 3)
(9, 3, 3)
(14, 3, 3)
(10, 3, 3)
(6, 3, 3)
(32, 3, 3)
(37, 3, 3)
(86, 3, 3)
(112, 3, 3)
(6, 3, 3)
(4, 3, 3)
(183, 3, 3)
(49, 3, 3)
(130, 3, 3)


In [32]:
np.load("s3://training-data-lstm/processed_data/train_frames_x_1_1.pkl", allow_pickle=True)

FileNotFoundError: [Errno 2] No such file or directory: 's3://training-data-lstm/processed_data/train_frames_x_1_1.pkl'

In [33]:
import pandas as pd
data = pd.read_csv("s3://training-data-lstm/keyword_level_complete_data2 (1).csv")

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
