In [17]:
import sagemaker
from sagemaker import get_execution_role

# IAM role that SageMaker jobs and endpoints will assume, plus the session
# object used for every later S3 / SageMaker call in this notebook
role = get_execution_role()
sess = sagemaker.Session()
role

'arn:aws:iam::011528297661:role/service-role/AmazonSageMaker-ExecutionRole-20240730T170025'

In [13]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost import XGBoostPredictor
import boto3
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [46]:
# Synthetic binary-classification problem: 1000 rows, 10 features, fixed seed
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=42,
)

# Hold out 20% of the rows; the split is seeded so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Build label-first tables: column 0 holds the target, columns 1.. hold the features
train_data = pd.DataFrame(np.c_[y_train, X_train])
test_data = pd.DataFrame(np.c_[y_test, X_test])

# Write both tables as bare CSV (no index column, no header row)
for frame, csv_name in ((train_data, 'train.csv'), (test_data, 'test.csv')):
    frame.to_csv(csv_name, index=False, header=False)

In [19]:
# Session's default bucket, and the key prefix under which this notebook's
# data and model artifacts are stored
bucket = sess.default_bucket()
prefix = "xgboost-classification"
bucket

'sagemaker-us-east-1-011528297661'

In [48]:
# Upload each CSV under its own sub-prefix; upload_data returns the resulting S3 URI
train_s3 = sess.upload_data(
    path='train.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/train',
)
test_s3 = sess.upload_data(
    path='test.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/test',
)

In [49]:
# Wrap the uploaded CSVs as training-job input channels.
# NOTE(review): the "validation" channel points at the same file that is later
# used for the final classification report — confirm that this is intended.
s3_input_train = TrainingInput(train_s3, content_type="csv")
s3_input_validation = TrainingInput(test_s3, content_type="csv")

In [50]:
# Look up the XGBoost 1.5-1 container image URI for the session's region
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.5-1",
)
container

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1'

In [51]:
# Model artifacts from the training job are written below this S3 location
output_location = f"s3://{bucket}/{prefix}/output"

# Estimator that runs the XGBoost container on a single ml.m4.xlarge instance
xgb = Estimator(
    image_uri=container,
    role=role,
    instance_type="ml.m4.xlarge",
    instance_count=1,
    output_path=output_location,
    sagemaker_session=sess,
)
xgb

<sagemaker.estimator.Estimator at 0x7fc43a7e1810>

In [52]:
# Hyperparameters for the binary classifier, collected in one place so they
# are easy to inspect and tweak
hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "verbosity": 0,
    "objective": "binary:logistic",
    "num_round": 100,
}
xgb.set_hyperparameters(**hyperparameters)

# Start the training job, feeding it the "train" and "validation" channels
channels = {"train": s3_input_train, "validation": s3_input_validation}
xgb.fit(channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-07-31-22-05-56-572


2024-07-31 22:05:56 Starting - Starting the training job...
2024-07-31 22:06:13 Starting - Preparing the instances for training...
2024-07-31 22:06:45 Downloading - Downloading input data...
2024-07-31 22:07:15 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2024-07-31 22:08:26.508 ip-10-0-187-79.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-07-31 22:08:26.533 ip-10-0-187-79.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-07-31:22:08:26:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-07-31:22:08:26:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2024-07-31:22:08:26:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-07-31:22:08:26:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2024-07-31:22:08:26:INFO] Determined 0 

In [53]:
# Host the trained model behind a real-time endpoint on one ml.m4.xlarge instance
predictor = xgb.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

# Show the generated endpoint name so it can be located later
print(f"Endpoint Name: {predictor.endpoint_name}")

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-07-31-22-09-09-075
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-07-31-22-09-09-075
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-07-31-22-09-09-075


-------!Endpoint Name: sagemaker-xgboost-2024-07-31-22-09-09-075


In [54]:
# Confirm the endpoint exists and build its HTTPS invocation URL.
# The client is pinned to the session's region so it queries the same region
# that the URL below is built from.
sagemaker_client = boto3.client('sagemaker', region_name=sess.boto_region_name)

# Raises if the endpoint cannot be found; the response is kept for inspection
endpoint_description = sagemaker_client.describe_endpoint(EndpointName=predictor.endpoint_name)

# The previous code stored 'EndpointConfigName' here, which is a configuration
# name rather than a URL, so the invocation URL is assembled from the region
# and endpoint name instead.
endpoint_url = (
    f"https://runtime.sagemaker.{sess.boto_region_name}.amazonaws.com"
    f"/endpoints/{predictor.endpoint_name}/invocations"
)

print(f"Endpoint URL: {endpoint_url}")

Endpoint URL: https://runtime.sagemaker.us-east-1.amazonaws.com/endpoints/sagemaker-xgboost-2024-07-31-22-09-09-075/invocations


In [59]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer

# Requests are sent as CSV; responses are parsed from JSON into Python objects
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

# Column 0 of test_data is the label, so only columns 1.. are sent as features
test_data_array = test_data.iloc[:, 1:].to_numpy()

# Score the whole hold-out set in a single request and preview the first five results
predictions = predictor.predict(test_data_array)
predictions['predictions'][:5]

[{'score': 0.14405474066734314},
 {'score': 0.9774831533432007},
 {'score': 0.737284779548645},
 {'score': 0.7516846656799316},
 {'score': 0.01202067919075489}]

In [61]:
# Flatten the response: each entry under 'predictions' is a dict with a 'score' value
scored_rows = predictions['predictions']
predictions_array = [row['score'] for row in scored_rows]
predictions_array[:5]

[0.14405474066734314,
 0.9774831533432007,
 0.737284779548645,
 0.7516846656799316,
 0.01202067919075489]

In [63]:
# Convert predictions to binary class (assuming the response is a JSON that includes prediction probabilities)
binary_predictions = (np.array(predictions_array) > 0.5).astype(int)
binary_predictions[:5]

array([0, 1, 1, 1, 0])

In [64]:
# True labels of the first five test rows, for comparison with binary_predictions[:5]
y_test[:5]

array([0, 1, 1, 1, 0])

In [65]:
# Print classification report
# Per-class precision / recall / F1 of the thresholded predictions on the test labels
report = classification_report(y_test, binary_predictions)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.91      0.87        89
           1       0.92      0.86      0.89       111

    accuracy                           0.88       200
   macro avg       0.88      0.88      0.88       200
weighted avg       0.88      0.88      0.88       200



In [66]:
# Clean up
# Remove every hosting resource created by deploy(), not just the endpoint.
# The model is deleted first, while the endpoint and its configuration still
# exist, because the SDK looks the model name up through them.
predictor.delete_model()

# Then delete the endpoint together with its endpoint configuration
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2024-07-31-22-09-09-075
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-07-31-22-09-09-075
