In [46]:
import os
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split

SETTINGS

In [47]:
bucket_name = 'sagemaker-bucket-ds'
project_path_s3 = 'training-jobs'

All data used in training jobs must be in a specific format:
- Data in csv format (',' as separator)
- No headers
- Target as the first column

LOAD IRIS DATA

In [48]:
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df["class"] = pd.Series(iris.target)
df = df[df['class'].isin([0, 1])] # Lets keep only class 0 and 1 to have binary classification
df = df[[list(df.columns)[-1]] + list(df.columns)[:-1]] # Reorder target as the first column
df.columns = df.columns.str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
df.head()

Unnamed: 0,class,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2
2,0,4.7,3.2,1.3,0.2
3,0,4.6,3.1,1.5,0.2
4,0,5.0,3.6,1.4,0.2


SPLIT INTO TRAIN TEST

In [49]:
train_df, test_df = train_test_split(df, test_size=0.33, random_state=42, stratify=df["class"])

SAVE DATA INTO LOCAL FOLDER

In [50]:
os.makedirs("DATA", exist_ok=True) # Create data folder

iris_train = train_df.to_numpy()
np.savetxt('./DATA/iris_train.csv', iris_train, delimiter=',', fmt='%1.1f, %1.3f, %1.3f, %1.3f, %1.3f')

iris_test = test_df.to_numpy()
np.savetxt('./DATA/iris_test.csv', iris_test, delimiter=',', fmt='%1.1f, %1.3f, %1.3f, %1.3f, %1.3f')

column_names_list = ','.join(df.columns)
with open('./DATA/column_names.csv', 'w') as file:
    file.write(column_names_list)

DISPLAY TRAIN

In [51]:
!head ./DATA/iris_train.csv

1.0, 5.900, 3.200, 4.800, 1.800
1.0, 5.800, 2.700, 3.900, 1.200
1.0, 6.700, 3.100, 4.400, 1.400
1.0, 6.200, 2.900, 4.300, 1.300
0.0, 5.500, 4.200, 1.400, 0.200
0.0, 5.200, 3.500, 1.500, 0.200
0.0, 5.000, 3.500, 1.300, 0.300
1.0, 6.800, 2.800, 4.800, 1.400
0.0, 5.400, 3.400, 1.500, 0.400
1.0, 6.900, 3.100, 4.900, 1.500


DISPLAY TEST

In [52]:
!head ./DATA/iris_train.csv

1.0, 5.900, 3.200, 4.800, 1.800
1.0, 5.800, 2.700, 3.900, 1.200
1.0, 6.700, 3.100, 4.400, 1.400
1.0, 6.200, 2.900, 4.300, 1.300
0.0, 5.500, 4.200, 1.400, 0.200
0.0, 5.200, 3.500, 1.500, 0.200
0.0, 5.000, 3.500, 1.300, 0.300
1.0, 6.800, 2.800, 4.800, 1.400
0.0, 5.400, 3.400, 1.500, 0.400
1.0, 6.900, 3.100, 4.900, 1.500


COPY DATA INTO S3

In [53]:
data_s3_path  = os.path.join('s3://', bucket_name, project_path_s3, 'data') 
train_s3_path = os.path.join(data_s3_path, 'train', 'iris_train.csv')
test_s3_path = os.path.join(data_s3_path, 'test', 'iris_test.csv')
columns_s3_path = os.path.join(data_s3_path, 'column_names.csv')

!aws s3 cp ./DATA/iris_train.csv {train_s3_path}
!aws s3 cp ./DATA/iris_test.csv {test_s3_path}
!aws s3 cp ./DATA/column_names.csv {columns_s3_path}

upload: DATA/iris_train.csv to s3://sagemaker-bucket-ds/training-jobs/data/train/iris_train.csv
upload: DATA/iris_test.csv to s3://sagemaker-bucket-ds/training-jobs/data/test/iris_test.csv
upload: DATA/column_names.csv to s3://sagemaker-bucket-ds/training-jobs/data/column_names.csv


CHECK WHETHER FILES ARE ON S3

In [54]:
!aws s3 ls s3://sagemaker-bucket-ds/training-jobs/data/ --recursive

2024-06-27 08:45:27         67 training-jobs/data/column_names.csv
2024-06-27 08:45:25       1056 training-jobs/data/test/iris_test.csv
2024-06-27 08:45:24       2144 training-jobs/data/train/iris_train.csv
