In [30]:
import boto3
import pandas as pd

# Fetch the raw dataset from the S3 bucket into the working directory, then
# load the local copy into a DataFrame.
bucket_name = '905418408481-pja-asi-14c-gr4'
s3_client = boto3.client('s3')

# The object key and the local file name are the same.
dataset_file = 'CrabAgePrediction.csv'
s3_client.download_file(Bucket=bucket_name, Key=dataset_file, Filename=dataset_file)

data = pd.read_csv(dataset_file)

In [31]:
# Quick sanity check of the loaded dataset: schema, summary statistics and
# per-column missing-value counts.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
data.info()
print(data.describe())
print(data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3893 entries, 0 to 3892
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             3893 non-null   object 
 1   Length          3893 non-null   float64
 2   Diameter        3893 non-null   float64
 3   Height          3893 non-null   float64
 4   Weight          3893 non-null   float64
 5   Shucked Weight  3893 non-null   float64
 6   Viscera Weight  3893 non-null   float64
 7   Shell Weight    3893 non-null   float64
 8   Age             3893 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 273.9+ KB
None
            Length     Diameter       Height       Weight  Shucked Weight  \
count  3893.000000  3893.000000  3893.000000  3893.000000     3893.000000   
mean      1.311306     1.020893     0.349374    23.567275       10.207342   
std       0.300431     0.248233     0.104976    13.891201        6.275275   
min       0.187500    

In [32]:
# Replace the string labels in 'Sex' with integer codes so the column is numeric.
sex_codes = {'M': 0, 'F': 1, 'I': 2}

# Guard on dtype so re-running this cell is a no-op: mapping the already-encoded
# integers through `sex_codes` a second time would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['Sex']):
    # Fail loudly on labels outside the mapping; `.map` would otherwise convert
    # them to NaN without any warning.
    unexpected_labels = set(data['Sex'].dropna().unique()) - set(sex_codes)
    if unexpected_labels:
        raise ValueError(
            f"Unmapped 'Sex' labels: {sorted(map(str, unexpected_labels))}"
        )
    data['Sex'] = data['Sex'].map(sex_codes)

In [33]:
from sklearn.model_selection import train_test_split

# Separate the target ('Age') from the predictors, then hold out 35% of the
# rows; the fixed random_state keeps the split reproducible between runs.
y = data['Age']
X = data.drop(columns='Age')

split_parts = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [34]:
def _label_first(labels, features):
    """Return one frame with the label as the first column, followed by the features.

    Both inputs get a fresh 0..n-1 index before they are joined column-wise.
    """
    parts = [labels.reset_index(drop=True), features.reset_index(drop=True)]
    return pd.concat(parts, axis=1)


train_data = _label_first(y_train, X_train)
test_data = _label_first(y_test, X_test)

# Written without header row and without index column.
for frame, csv_name in ((train_data, 'train_data.csv'), (test_data, 'test_data.csv')):
    frame.to_csv(csv_name, index=False, header=False)

In [35]:
# Push both local CSV files to their own prefix in the bucket.
for local_csv, s3_key in (
    ('train_data.csv', 'train/train_data.csv'),
    ('test_data.csv', 'test/test_data.csv'),
):
    s3_client.upload_file(local_csv, bucket_name, s3_key)

In [36]:
import sagemaker

# Resolve the XGBoost 1.7-1 image URI for the region of the current boto3 session.
aws_region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve(framework='xgboost', region=aws_region, version='1.7-1')

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [37]:
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator

# XGBoost training settings, passed to the estimator as strings.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    num_round="50",
)

# Role used for the training job and S3 location for its output.
training_role = get_execution_role()
model_output_uri = f's3://{bucket_name}/output'

xgb_estimator = Estimator(
    image_uri=container,
    role=training_role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=model_output_uri,
    hyperparameters=hyperparameters,
)

In [38]:
from sagemaker.inputs import TrainingInput

In [39]:
# Describe the two CSV inputs stored in the bucket.
train_input = TrainingInput(
    s3_data=f's3://{bucket_name}/train/train_data.csv',
    content_type='csv',
)
test_input = TrainingInput(
    s3_data=f's3://{bucket_name}/test/test_data.csv',
    content_type='csv',
)

# The held-out split is supplied as the 'validation' channel.
channels = {'train': train_input, 'validation': test_input}
xgb_estimator.fit(inputs=channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-06-20-06-31-49-957


2024-06-20 06:31:50 Starting - Starting the training job...
2024-06-20 06:32:09 Starting - Preparing the instances for training...
2024-06-20 06:32:39 Downloading - Downloading input data...
2024-06-20 06:33:14 Downloading - Downloading the training image......
2024-06-20 06:34:04 Training - Training image download completed. Training in progress...[34m[2024-06-20 06:34:27.120 ip-10-0-113-40.eu-west-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-06-20 06:34:27.142 ip-10-0-113-40.eu-west-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-06-20:06:34:27:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-06-20:06:34:27:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-06-20:06:34:27:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-06-20:06:34:27:INFO] Running XGBoost Sagemak

In [40]:
# Deploy the trained estimator to an endpoint backed by one ml.m4.xlarge instance.
endpoint_settings = dict(initial_instance_count=1, instance_type='ml.m4.xlarge')
predictor = xgb_estimator.deploy(**endpoint_settings)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-06-20-06-35-35-394
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-06-20-06-35-35-394
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-06-20-06-35-35-394


------!

In [42]:
import numpy as np
import pandas as pd
import io
import boto3

# A handful of feature rows (no 'Age' column) to score against the endpoint.
feature_columns = [
    'Sex', 'Length', 'Diameter', 'Height',
    'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight',
]
sample_rows = [
    [0, 1.4375, 1.175, 0.4125, 24.6357155, 12.3320325, 5.5848515, 6.747181],
    [1, 1.6375, 1.2875, 0.5, 42.354153, 20.56756225, 8.7599955, 11.4815475],
    [2, 1.6375, 1.2875, 0.3625, 35.436875, 14.92601175, 8.0229085, 8.9300925],
    [0, 1.125, 0.8375, 0.35, 13.11164375, 4.649318, 2.154562, 4.252425],
]
sample_data = pd.DataFrame(sample_rows, columns=feature_columns)

# Serialise the rows as header-less, index-less CSV and encode them to bytes
# for the request body.
csv_buffer = io.StringIO()
sample_data.to_csv(csv_buffer, header=False, index=False)
payload = csv_buffer.getvalue().encode('utf-8')
print(payload)

# Call the deployed endpoint through the low-level runtime client.
runtime = boto3.client('sagemaker-runtime')
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType='text/csv',
    Body=payload,
)

# The response body is a stream; read it fully and decode it to text.
result = response['Body'].read().decode('utf-8')
print(result)

b'0,1.4375,1.175,0.4125,24.6357155,12.3320325,5.5848515,6.747181\n1,1.6375,1.2875,0.5,42.354153,20.56756225,8.7599955,11.4815475\n2,1.6375,1.2875,0.3625,35.436875,14.92601175,8.0229085,8.9300925\n0,1.125,0.8375,0.35,13.11164375,4.649318,2.154562,4.252425\n'
9.297533988952637
10.998446464538574
12.54384994506836
10.815958976745605



In [43]:
# Summary statistics of each split, training set first.
for split_frame in (train_data, test_data):
    print(split_frame.describe())

               Age          Sex       Length     Diameter       Height  \
count  2530.000000  2530.000000  2530.000000  2530.000000  2530.000000   
mean      9.922925     0.960474     1.308651     1.019111     0.348839   
std       3.225048     0.824700     0.300837     0.248631     0.108452   
min       2.000000     0.000000     0.275000     0.225000     0.037500   
25%       8.000000     0.000000     1.125000     0.875000     0.287500   
50%      10.000000     1.000000     1.362500     1.062500     0.350000   
75%      11.000000     2.000000     1.537500     1.200000     0.412500   
max      29.000000     2.000000     2.037500     1.625000     2.825000   

            Weight  Shucked Weight  Viscera Weight  Shell Weight  
count  2530.000000     2530.000000     2530.000000   2530.000000  
mean     23.446941       10.161693        5.108877      6.752722  
std      13.812752        6.251106        3.097543      3.895523  
min       0.226796        0.070874        0.056699      0.085048 

In [44]:
# Tear down the hosted endpoint together with its endpoint configuration.
predictor.delete_endpoint(delete_endpoint_config=True)

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2024-06-20-06-35-35-394
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-06-20-06-35-35-394
