### 1. Setup environment

In [3]:
# This section sets up the environment by running the environment setup script, which ensures that all required libraries and configurations are loaded.
%run 0-Environment_Setup.ipynb

[0msagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
Stored 's3_datalake_path_csv' (str)
Stored 'local_data_path_csv' (str)
Stored 's3_datalake_path_parquet' (str)


### 2. Set session variables

In [5]:
# This section initializes the session variables required for interacting with SageMaker, such as the session, role, region, and default S3 bucket for storing data.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sagemaker_session.boto_session.region_name
bucket = sagemaker_session.default_bucket()

train_prefix = "store-sales-forecasting/train"
test_prefix = "store-sales-forecasting/test"
val_prefix = "store-sales-forecasting/val"

### 3. Create a temporary directory for saving data

In [6]:
# This section creates a temporary directory to store local data files before uploading them to S3.
local_path = f"{os.getcwd()}/tmp"
if not os.path.exists(local_path):
    os.makedirs(local_path)


### 4. Pull the data from the feature store sorted by date and then store number

In [7]:
# This section pulls the dataset from the feature store, sorted by date and store number to maintain a chronological order for sales data.
sales_features_store_df = get_store_dataset_from_offline_feature_group_date_sort(store_sales_feature_group)
sales_features_store_df.head()

Running 
    SELECT *
    FROM
        "store_sales_feature_group_offline_1729084164"
    ORDER BY
        date ASC, store_nbr ASC
    


Unnamed: 0,date,store_nbr,sales,oil,onpromotion,is_holiday,city,state,cluster,year,...,month_sin,day_cos,day_sin,dow_cos,dow_sin,sales_record_id,event_time,write_time,api_invocation_time,is_deleted
0,2013-01-01,1,0.0,93.14,0,1,18,12,13,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:1,1729086000.0,2024-10-16 13:39:30.285,2024-10-16 13:36:55.000,False
1,2013-01-01,2,0.0,93.14,0,1,18,12,13,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:2,1729086000.0,2024-10-16 13:39:34.032,2024-10-16 13:36:55.000,False
2,2013-01-01,3,0.0,93.14,0,1,18,12,8,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:3,1729086000.0,2024-10-16 13:39:31.763,2024-10-16 13:36:55.000,False
3,2013-01-01,4,0.0,93.14,0,1,18,12,9,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:4,1729086000.0,2024-10-16 13:39:35.868,2024-10-16 13:36:55.000,False
4,2013-01-01,5,0.0,93.14,0,1,21,14,4,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:5,1729086000.0,2024-10-16 13:39:34.032,2024-10-16 13:36:55.000,False


### 5. Drop miscellaneous info

In [8]:
# This section drops unnecessary columns from the dataset, such as event metadata and redundant features, to clean up the data for further processing.
sales_features_store_df.drop(columns=[
    "sales_record_id", "event_time", "write_time", "api_invocation_time", "is_deleted", 
    "city-state", "city-state-cluster", "hash_0", "hash_1", "hash_2", "hash_3", "hash_4", 
    "hash_5", "hash_6", "hash_7", "hash_8", "hash_9"
], inplace=True)



In [9]:
sales_features_store_df.info

<bound method DataFrame.info of              date  store_nbr     sales    oil  onpromotion  is_holiday  city  \
0      2013-01-01          1      0.00  93.14            0           1    18   
1      2013-01-01          2      0.00  93.14            0           1    18   
2      2013-01-01          3      0.00  93.14            0           1    18   
3      2013-01-01          4      0.00  93.14            0           1    18   
4      2013-01-01          5      0.00  93.14            0           1    21   
...           ...        ...       ...    ...          ...         ...   ...   
90931  2017-08-15         50  16879.12  47.57           35           0     0   
90932  2017-08-15         51  20154.56  47.57           29           0     8   
90933  2017-08-15         52  18600.05  47.57           37           0    14   
90934  2017-08-15         53   8208.19  47.57           33           0    14   
90935  2017-08-15         54  12666.86  47.57           76           0     5   

       

### 6. Perform feature selection

In [10]:
# This section selects the features to be used in the model based on the findings from the EDA. These include key features like sales, promotions, holiday indicators, and store characteristics.
selected_features = [
    "sales", 
    "oil", 
    "onpromotion",
    "is_holiday", 
    "month_cos",
    "month_sin",
    "day_cos",
    "day_sin",
    "dow_cos",
    "dow_sin",
    "cluster"
]
sales_features_store_df = sales_features_store_df[selected_features + ["date", "store_nbr"]]


### 7. One-hot encode categorical features

In [11]:
# This section one-hot encodes categorical features like cluster to prepare them for the regression model.
sales_features_store_df = pd.get_dummies(sales_features_store_df, columns=["cluster"], drop_first=True)

### 8.Rolling Window Feature Engineering for Predicting 7 Days into the Future

In [12]:
# Next  we'll create rolling windows of data for training, with input sequences of 60 (or 90) days to predict the next 7 days of sales.
def create_rolling_windows(df, input_seq_length=60, target_seq_length=7):
    windows = []
    targets = []

    # Iterate over each store separately
    for store in df['store_nbr'].unique():
        store_data = df[df['store_nbr'] == store].sort_values(by='date')
        num_days = len(store_data)

        for i in range(0, num_days - input_seq_length - target_seq_length + 1):
            input_window = store_data.iloc[i:i + input_seq_length].drop(columns=['date', 'store_nbr', 'sales']).values
            target_window = store_data.iloc[i + input_seq_length:i + input_seq_length + target_seq_length]['sales'].values
            
            windows.append(input_window)
            targets.append(target_window)

    return np.array(windows), np.array(targets)

input_seq_length = 60
target_seq_length = 7
X, y = create_rolling_windows(sales_features_store_df, input_seq_length, target_seq_length)


### 9. Split the data into train/test/val sets 

In [13]:
# This section splits the dataset into training, testing, and validation sets to evaluate the model's performance.
n = len(X)
X_train, y_train = X[:int(n * 0.8)], y[:int(n * 0.8)]
X_test, y_test = X[int(n * 0.8):int(n * 0.9)], y[int(n * 0.8):int(n * 0.9)]
X_val, y_val = X[int(n * 0.9):], y[int(n * 0.9):]


### 10. Normalize Features Using StandardScaler

In [14]:
from sklearn import preprocessing

# This section normalizes the feature values using StandardScaler to ensure they are on the same scale, which helps improve model performance.

# Flatten the first two dimensions (stores * instances, features) to normalize correctly.
X_train_reshaped = X_train.reshape(-1, X_train.shape[-1])
X_test_reshaped = X_test.reshape(-1, X_test.shape[-1])
X_val_reshaped = X_val.reshape(-1, X_val.shape[-1])

# Initialize StandardScaler
scaler = preprocessing.StandardScaler()

# Normalize features for training, test, and validation datasets
X_train_reshaped = scaler.fit_transform(X_train_reshaped)
X_test_reshaped = scaler.transform(X_test_reshaped)
X_val_reshaped = scaler.transform(X_val_reshaped)

# Reshape back to original dimensions
X_train = X_train_reshaped.reshape(X_train.shape)
X_test = X_test_reshaped.reshape(X_test.shape)
X_val = X_val_reshaped.reshape(X_val.shape)



### 11. Save mean and standard deviation arrays to S3

In [15]:
# Save mean and standard deviation arrays to S3 for use during model training and inference
np.save(f"{local_path}/global_mean.npy", scaler.mean_)
np.save(f"{local_path}/global_stddev.npy", scaler.scale_)


In [16]:
sagemaker_session.upload_data(f"{local_path}/global_mean.npy", bucket=bucket, key_prefix=train_prefix)
sagemaker_session.upload_data(f"{local_path}/global_stddev.npy", bucket=bucket, key_prefix=train_prefix)

's3://sagemaker-us-east-2-612877486901/store-sales-forecasting/train/global_stddev.npy'

### 12. Save Normalized Data Splits Locally

In [17]:
# This section saves the training, testing, and validation data splits locally as CSV files before uploading them to S3.
np.save(f"{local_path}/X_train.npy", X_train)
np.save(f"{local_path}/y_train.npy", y_train)

np.save(f"{local_path}/X_test.npy", X_test)
np.save(f"{local_path}/y_test.npy", y_test)

np.save(f"{local_path}/X_val.npy", X_val)
np.save(f"{local_path}/y_val.npy", y_val)



### 13. Save data splits to S3

In [18]:
# This section uploads the training, testing, and validation data splits to S3 for use during model training.
sagemaker_session.upload_data(f"{local_path}/X_train.npy", bucket=bucket, key_prefix=train_prefix)
sagemaker_session.upload_data(f"{local_path}/y_train.npy", bucket=bucket, key_prefix=train_prefix)

sagemaker_session.upload_data(f"{local_path}/X_test.npy", bucket=bucket, key_prefix=test_prefix)
sagemaker_session.upload_data(f"{local_path}/y_test.npy", bucket=bucket, key_prefix=test_prefix)

sagemaker_session.upload_data(f"{local_path}/X_val.npy", bucket=bucket, key_prefix=val_prefix)
sagemaker_session.upload_data(f"{local_path}/y_val.npy", bucket=bucket, key_prefix=val_prefix)


's3://sagemaker-us-east-2-612877486901/store-sales-forecasting/val/y_val.npy'