In [1]:
# Install notebook dependencies with %pip, quietly (-q).
# NOTE(review): aiobotocore is upgraded to whatever the latest release is (no version pin),
# so a fresh run may resolve a different version than the one this notebook was written with.
# Presumably it is needed by the S3 filesystem layer used when pandas reads s3:// URIs — TODO confirm and pin.
%pip install --upgrade -q aiobotocore
# xgboost is pinned to 1.3.1; the kernel may need a restart before the pinned version is importable.
%pip install -q  xgboost==1.3.1

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
#INSTANTIATE THE S3 CLIENT OBJECT AND THE LOCATIONS INSIDE THE DEFAULT S3 BUCKET
#THE DEFAULT BUCKET HAS THE NAME: sagemaker-<region>-<account-id>
#<READ> THE DATASET USED FOR TRAINING IS STORED IN A PUBLIC S3 BUCKET: sagemaker-sample-files

In [None]:
import pandas as pd
import boto3
import sagemaker
import json
import joblib
import xgboost as xgb #CLASSIFIER: XGBOOST
from sklearn.metrics import roc_auc_score

# --- SageMaker session, region, S3 client and execution role -------------------
sess = sagemaker.Session()
region = sess.boto_region_name
s3_client = boto3.client("s3", region_name=region)
sagemaker_role = sagemaker.get_execution_role()

# --- Write side: the session's default bucket, everything under one demo prefix ---
write_bucket = sess.default_bucket()
write_prefix = "fake-review-detect-demo"

# --- Read side: bucket and prefix that hold the train/test CSV files ------------
read_bucket = "sagemaker-sample-files"
read_prefix = "datasets/tabular/amazon_reviews"

# Object keys (paths inside a bucket, no scheme or bucket name)
train_data_key = read_prefix + "/train.csv"
test_data_key = read_prefix + "/test.csv"
model_key = write_prefix + "/model"
output_key = write_prefix + "/output"

# Full s3:// URIs for the input data, as consumed by pd.read_csv further down
train_data_uri = "s3://" + read_bucket + "/" + train_data_key
test_data_uri = "s3://" + read_bucket + "/" + test_data_key

In [None]:
#TRAIN THE XGBOOST MODEL
#-> THE "LABEL" COLUMN IS THE TARGET COLUMN
#-> HYPERPARAMETER TUNING OF XGBOOST; EVALUATION METRIC = ROC-AUC

In [None]:
# Booster parameters handed to both xgb.cv and xgb.train.
# These are fixed, hand-picked values. TODO: decide whether they should be tuned.
hyperparams = dict(
    max_depth=3,
    eta=0.2,
    objective="binary:logistic",
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
)

# Training-loop settings: upper bound on boosting rounds, number of CV folds,
# and the early-stopping patience (in rounds).
nfold = 3
num_boost_round = 100
early_stopping_rounds = 10



# Load the training CSV; the target is the column named by label_col.
label_col = "LABEL"
data = pd.read_csv(train_data_uri)

# Target as a one-column DataFrame; features are every remaining column.
train_label = data[label_col].to_frame()
train_features = data.drop(columns=label_col)
dtrain = xgb.DMatrix(train_features, label=train_label)

# k-fold cross-validation on the training data, scored with AUC.
# The seed is fixed so the fold assignment is repeatable between runs.
cv_settings = dict(
    params=hyperparams,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    nfold=nfold,
    early_stopping_rounds=early_stopping_rounds,
    metrics=["auc"],
    seed=10,
)
cv_results = xgb.cv(**cv_settings)


# All reported numbers come from the last row of the cross-validation history.
last_cv_round = cv_results.iloc[-1]

validation_auc_summary = {
    "value": last_cv_round["test-auc-mean"],
    "standard_deviation": last_cv_round["test-auc-std"],
}
train_auc_summary = {
    "value": last_cv_round["train-auc-mean"],
    "standard_deviation": last_cv_round["train-auc-std"],
}

# Metrics report; xgb.cv's "test-*" columns are published under the "validation:auc" key.
metrics_data = {
    "binary_classification_metrics": {
        "validation:auc": validation_auc_summary,
        "train:auc": train_auc_summary,
    }
}

# Cross-validated AUC on the training folds vs. the held-out folds.
# A large gap between the two suggests overfitting to the training data.
print(f"Cross-validated train-auc:{last_cv_round['train-auc-mean']:.2f}")
print(f"Cross-validated validation-auc:{last_cv_round['test-auc-mean']:.2f}")



In [None]:
# Load the held-out test set and split it into features and target.
data = pd.read_csv(test_data_uri)
test_features = data.drop(label_col, axis=1)
test_label = pd.DataFrame(data[label_col])
dtest = xgb.DMatrix(test_features, label=test_label)

# Pick the number of boosting rounds from cross-validation rather than by
# early-stopping on the test set. Early-stopping on dtest and then scoring on
# dtest leaks test information into model selection and makes the reported
# test AUC optimistic. xgb.cv with early stopping returns its history truncated
# at the best iteration, so the history length is the selected round count.
final_num_boost_round = len(cv_results)

# Train the final model on the full training set. The test set is not passed
# to training at all; it is used only for the evaluation below.
model = xgb.train(
    params=hyperparams,
    dtrain=dtrain,
    num_boost_round=final_num_boost_round,
    verbose_eval=0,
)

# Test model performance on train and test sets
test_pred = model.predict(dtest)
train_pred = model.predict(dtrain)

test_auc = roc_auc_score(test_label, test_pred)
train_auc = roc_auc_score(train_label, train_pred)

print(f"Train-auc:{train_auc:.2f}, Test-auc:{test_auc:.2f}")


In [2]:
#SAVE THE EVALUATION METRICS AS A JSON FILE
#SAVE THE TRAINED MODEL AS A PICKLE FILE
#-> BOTH TO THE LOCAL DIRECTORY WITHIN SAGEMAKER STUDIO & TO THE DEFAULT S3 BUCKET

In [None]:
# Step 1: write both artifacts into the notebook's working directory.
local_metrics_path = "./metrics.json"
local_model_path = "./xgboost-model"

with open(local_metrics_path, "w") as metrics_file:
    json.dump(metrics_data, metrics_file)

# The model is serialized with joblib (pickle-based), so only load this file
# back from a location you trust.
with open(local_model_path, "wb") as model_file:
    joblib.dump(model, model_file)

# Step 2: copy the same two files into the write bucket, metrics under the
# output prefix and the model under the model prefix.
metrics_location = f"{output_key}/metrics.json"
model_location = f"{model_key}/xgboost-model"

uploads = (
    (local_metrics_path, metrics_location),
    (local_model_path, model_location),
)
for local_path, s3_key in uploads:
    s3_client.upload_file(Filename=local_path, Bucket=write_bucket, Key=s3_key)
